{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge data set CSV files\n",
    "\n",
    "Merges the CSV files of each split (`train`, `validate`, `test`) of the *Found* and *NotFound* data sets into one CSV per split, then merges the three split files into one CSV per data set.\n",
    "\n",
    "The notebook is meant to be run top to bottom (Restart Kernel & Run All) and can be re-run safely: a merged file is never used as one of its own inputs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folders of the two data sets; each one holds train/, validate/ and test/ sub-folders.\n",
    "FOUND_DIR = './Dataset/Found/'\n",
    "NOT_FOUND_DIR = './Dataset/NotFound/'\n",
    "\n",
    "# Folders that receive the merged CSV files (created on demand by merge_csv_dir).\n",
    "FOUND_COMPLETE_DIR = FOUND_DIR + 'Complete/'\n",
    "NOT_FOUND_COMPLETE_DIR = NOT_FOUND_DIR + 'Complete/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def list_csv_files(src_dir, exclude=()):\n",
    "    \"\"\"Return the name-sorted paths of the CSV files stored directly in ``src_dir``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    src_dir : str\n",
    "        Folder to scan (sub-folders are not visited).\n",
    "    exclude : iterable of str, optional\n",
    "        Paths to leave out, e.g. a merged file that is written into ``src_dir`` itself.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        Paths of the regular files whose name ends in ``.csv`` (case-insensitive).\n",
    "        Folders such as ``.ipynb_checkpoints`` and files such as ``.DS_Store`` are\n",
    "        skipped so they are never handed to ``pd.read_csv``.\n",
    "    \"\"\"\n",
    "    skipped = {os.path.abspath(p) for p in exclude}\n",
    "    # os.listdir returns entries in arbitrary order, so sort for a reproducible merge order.\n",
    "    candidates = (os.path.join(src_dir, name) for name in sorted(os.listdir(src_dir)))\n",
    "    return [\n",
    "        p for p in candidates\n",
    "        if p.lower().endswith('.csv')\n",
    "        and os.path.isfile(p)\n",
    "        and os.path.abspath(p) not in skipped\n",
    "    ]\n",
    "\n",
    "\n",
    "def merge_csv_dir(src_dir, out_path):\n",
    "    \"\"\"Merge every CSV file in ``src_dir`` into the single CSV file ``out_path``.\n",
    "\n",
    "    Each source file is read in name order and tagged with a ``File_Name`` column\n",
    "    holding its base name (an existing ``File_Name`` column is overwritten). The\n",
    "    frames are concatenated with a fresh index and saved without the index column.\n",
    "    ``out_path`` itself is never used as an input, so the merge can be re-run even\n",
    "    when the result is written into ``src_dir``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    src_dir : str\n",
    "        Folder containing the CSV files to merge.\n",
    "    out_path : str\n",
    "        Path of the merged CSV file; its folder is created if missing.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The merged data, read back from ``out_path``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``src_dir`` contains no CSV file to merge.\n",
    "    \"\"\"\n",
    "    csv_files = list_csv_files(src_dir, exclude=[out_path])\n",
    "    if not csv_files:\n",
    "        raise ValueError(f'No CSV files to merge in {src_dir!r}')\n",
    "\n",
    "    frames = [pd.read_csv(f).assign(File_Name=os.path.basename(f)) for f in csv_files]\n",
    "    merged = pd.concat(frames, ignore_index=True)\n",
    "\n",
    "    out_dir = os.path.dirname(out_path)\n",
    "    if out_dir:\n",
    "        os.makedirs(out_dir, exist_ok=True)\n",
    "    merged.to_csv(out_path, index=False)\n",
    "\n",
    "    # Read the file back so the checks below inspect what is actually on disk.\n",
    "    return pd.read_csv(out_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge Found Data set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the train files, then show which source files ended up in the result\n",
    "found_train = merge_csv_dir(FOUND_DIR + 'train/', FOUND_COMPLETE_DIR + 'data_pubmed_train_found.csv')\n",
    "found_train['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_train.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_train['journal'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the validation files, then show which source files ended up in the result\n",
    "found_validation = merge_csv_dir(FOUND_DIR + 'validate/', FOUND_COMPLETE_DIR + 'data_pubmed_validation_found.csv')\n",
    "found_validation['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_validation.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_validation.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_validation['journal'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the test files, then show which source files ended up in the result\n",
    "found_test = merge_csv_dir(FOUND_DIR + 'test/', FOUND_COMPLETE_DIR + 'data_pubmed_test_found.csv')\n",
    "found_test['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_test.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_test['journal'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge Not Found Data set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the train files, then show which source files ended up in the result\n",
    "not_found_train = merge_csv_dir(NOT_FOUND_DIR + 'train/', NOT_FOUND_COMPLETE_DIR + 'data_pubmed_train_not_found.csv')\n",
    "not_found_train['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_train.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the validation files, then show which source files ended up in the result\n",
    "not_found_validation = merge_csv_dir(NOT_FOUND_DIR + 'validate/', NOT_FOUND_COMPLETE_DIR + 'data_pubmed_validation_not_found.csv')\n",
    "not_found_validation['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_validation.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_validation.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the test files, then show which source files ended up in the result\n",
    "not_found_test = merge_csv_dir(NOT_FOUND_DIR + 'test/', NOT_FOUND_COMPLETE_DIR + 'data_pubmed_test_not_found.csv')\n",
    "not_found_test['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_test.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge all\n",
    "\n",
    "The split files written above are merged into one file per data set. The result is saved in the same `Complete/` folder it reads from; `merge_csv_dir` leaves that output file out of its inputs, so running this section again does not duplicate rows.\n",
    "\n",
    "Note: in these merged files `File_Name` holds the name of the *split* file each row came from, because the column is overwritten at every merge."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge Found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the three Found split files, then show which of them ended up in the result\n",
    "found_all = merge_csv_dir(FOUND_COMPLETE_DIR, FOUND_COMPLETE_DIR + 'data_pubmed_found.csv')\n",
    "found_all['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_all.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "found_all.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge NotFound"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the three NotFound split files, then show which of them ended up in the result\n",
    "not_found_all = merge_csv_dir(NOT_FOUND_COMPLETE_DIR, NOT_FOUND_COMPLETE_DIR + 'data_pubmed_not_found.csv')\n",
    "not_found_all['File_Name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_all.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "not_found_all.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TFM-LuckyLook",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}