{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge Found Data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/Found/train/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists(f'./Dataset/Found/Complete/'):\n",
    "    os.makedirs(f'./Dataset/Found/Complete/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/Found/Complete/' + 'data_pubmed_train_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_train_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['journal'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/Found/validate/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/Found/Complete/' + 'data_pubmed_validation_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_validation_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['journal'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/Found/test/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/Found/Complete/' + 'data_pubmed_test_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_test_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.unique(df['journal']).shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge Not Found Data set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/NotFound/train/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists(f'./Dataset/NotFound/Complete/'):\n",
    "    os.makedirs(f'./Dataset/NotFound/Complete/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/NotFound/Complete/' + 'data_pubmed_train_not_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/NotFound/Complete/data_pubmed_train_not_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/NotFound/validate/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/NotFound/Complete/' + 'data_pubmed_validation_not_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/NotFound/Complete/data_pubmed_validation_not_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/NotFound/test/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/NotFound/Complete/' + 'data_pubmed_test_not_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/NotFound/Complete/data_pubmed_test_not_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge all"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge Found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/Found/Complete/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/Found/Complete/' + 'data_pubmed_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge NotFound"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. defines path to csv files\n",
    "path = \"./Dataset/NotFound/Complete/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. creates list with files to merge based on name convention\n",
    "file_list = [path + f for f in os.listdir(path)]\n",
    "file_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. creates empty list to include the content of each file converted to pandas DF\n",
    "csv_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. reads each (sorted) file in file_list, converts it to pandas DF and appends it to the csv_list\n",
    "for file in sorted(file_list):\n",
    "    csv_list.append(pd.read_csv(file).assign(File_Name = os.path.basename(file)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. merges single pandas DFs into a single DF, index is refreshed \n",
    "csv_merged = pd.concat(csv_list, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Single DF is saved to the path in CSV format, without index column\n",
    "csv_merged.to_csv('./Dataset/NotFound/Complete/' + 'data_pubmed_not_found.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./Dataset/NotFound/Complete/data_pubmed_not_found.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TFM-LuckyLook",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}