{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Data exploration of the articles (Automed data)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Articles found (eliminate duplicates)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read csv\n", "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_found.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 831000 articles\n", "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get only the pubmed id\n", "df_p = df.loc[:,'AKE_pubmed_id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 44 duplciated articles\n", "duplicated_rows = df_p[df_p.duplicated(keep=False)]\n", "duplicated_rows.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Threre are duplicates max 2\n", "df_p.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the duplicated articles information\n", "# CHECK THE DUPLCIATES IN PUBMED WEB\n", "df.iloc[duplicated_rows.index]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Rows that will be eliminated because of duplicate\n", "# PROBLEMS: same pmcid, erratum, retracted.\n", "mask = [7016, 28652, 34115, 36974, 101609, 134736, 209387, 237868, 270683, 302438, 308649, 349159, 427168, 444565, 481005, 523773, 527998, 581868, 719666, 726571, 773107, 817015]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop the duplicates and save it\n", "df = df.drop(mask)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save data set after eliminating the duplicates\n", "df.to_csv('./Dataset/Found/Complete/data_pubmed_found_final.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save data set after eliminating the duplicates\n", "df.to_csv('./Dataset/Complete/data_pubmed_all.csv', index=False)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Articles not found" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_not_found = pd.read_csv('./Dataset/NotFound/Complete/data_pubmed_not_found.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_not_found.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_not_found.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 11372 DUPLCIATED ARTICLES\n", "duplicated_rows = df_not_found[df_not_found['pcmid_AKE'].duplicated(keep=False)]\n", "duplicated_rows.index" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "# Read csv\n", "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_found_final.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get columns that we want: date, pmcid, title, abstract, keywords, journal\n", "df_p = df.loc[:,['publication_date',\n", " 'AKE_pubmed_id', 'AKE_pubmed_title', 'AKE_abstract',\n", " 'AKE_keywords', 'journal']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p['journal'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Most recent paper 2022-12-13\n", "df_p['publication_date'].max()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get papers that are 2018 forward, papers that are in the 5 years range \n", "df_p = df_p[df_p['publication_date'] > '2018']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p['journal'].nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Count the number of occurrences of each unique element\n", "counts = df_p['journal'].value_counts()\n", "\n", "# Filter the dataframe based on the count\n", "filtered_df = df_p[df_p['journal'].isin(counts[counts > 200].index)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_df['journal'].nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ast\n", "ast.literal_eval(filtered_df.iloc[0]['AKE_keywords'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save the filtered data set\n", "filtered_df.to_csv('Data/Complete/data_pubmed.csv', index=False)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Split data set to 60% train 20% validation and 20% test" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('./Dataset/Complete/data_pubmed.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Split the dataset by journal\n", "journals = df['journal'].unique()\n", "train_ratio = 0.6\n", "\n", "train_data = pd.DataFrame()\n", "val_data = pd.DataFrame()\n", "test_data = pd.DataFrame()\n", "\n", "for journal in journals:\n", " # Filter the dataset by journal\n", " journal_data = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(train_data['journal'].nunique())\n", "print(val_data['journal'].nunique())\n", "print(test_data['journal'].nunique())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Train set \\n-----------------------------------------------------')\n", "print(train_data['journal'].value_counts())\n", "print('Validation set \\n-----------------------------------------------------')\n", "print(val_data['journal'].value_counts())\n", "print('Test set \\n-----------------------------------------------------')\n", "print(test_data['journal'].value_counts())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save the train, validation, and test sets to csv files\n", "train_data.to_csv('./Dataset/data_pubmed_train.csv', index=False)\n", "val_data.to_csv('./Dataset/data_pubmed_val.csv', index=False)\n", "test_data.to_csv('./Dataset/data_pubmed_test.csv', index=False)\n" ] }
], "metadata": { "kernelspec": { "display_name": "TFM", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }