{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Analysis" ] },
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## PubMed all" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
   "# Unfiltered PubMed data: contains all articles\n",
   "pubmed_all = pd.read_csv('./Dataset/data_pubmed_all.csv')" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['pubmed_id', 'title', 'keywords', 'journal', 'abstract', 'conclusions',\n", " 'methods', 'results', 'copyrights', 'doi', 'publication_date',\n", " 'authors', 'AKE_pubmed_id', 'AKE_pubmed_title', 'AKE_abstract',\n", " 'AKE_keywords', 'File_Name'],\n", " dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# Column names of the unfiltered dataframe\n",
   "pubmed_all.columns" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8335" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many distinct journals appear\n",
   "pubmed_all['journal'].nunique()" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "830978" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many articles (rows) there are\n",
   "len(pubmed_all)" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
   "# Articles per journal (value_counts lists the most frequent journal first)\n",
   "article_count_per_journal = pubmed_all['journal'].value_counts()" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 8335.000000\n", "mean 99.697421\n", "std 416.964437\n", "min 1.000000\n", "25% 2.000000\n", "50% 10.000000\n", "75% 49.000000\n", "max 17236.000000\n", "Name: journal, dtype: float64" ] }, 
"execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "article_count_per_journal.describe()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
   "# Zoom into the first of 10 equal-width bins of the articles-per-journal counts.\n",
   "# Only the bin edges are needed, so request them directly instead of a full histogram.\n",
   "bin_edges = np.histogram_bin_edges(article_count_per_journal, bins=10)\n",
   "first_bin_low, first_bin_high = bin_edges[0], bin_edges[1]\n",
   "\n",
   "# NumPy histogram bins are half-open [low, high) (only the last bin is closed), so a\n",
   "# count equal to the upper edge belongs to the second bin and is excluded here.\n",
   "articles_in_first_bin = article_count_per_journal[\n",
   "    (article_count_per_journal >= first_bin_low) & (article_count_per_journal < first_bin_high)\n",
   "]\n",
   "\n",
   "# Default font size for any text that is not sized explicitly below\n",
   "plt.rcParams.update({'font.size': 18})\n",
   "\n",
   "fig, ax = plt.subplots(figsize=(10, 6))\n",
   "ax.hist(articles_in_first_bin, bins=10, edgecolor='black', log=True)\n",
   "\n",
   "ax.set_title('Log-Scaled Histogram of Articles per Journal in 10 bins', fontsize=16)\n",
   "ax.set_xlabel('Number of Articles', fontsize=16)\n",
   "ax.set_ylabel('Number of Journals (Log Scale)', fontsize=16)\n",
   "ax.tick_params(axis='both', labelsize=14)\n",
   "\n",
   "fig.savefig('Histogram of Articles per Journal.pdf', format='pdf')\n",
   "plt.show()\n" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
   "# Second zoom level: the first of 10 equal-width bins *within* `articles_in_first_bin`.\n",
   "bin_edges = np.histogram_bin_edges(articles_in_first_bin, bins=10)\n",
   "zoom_low, zoom_high = bin_edges[0], bin_edges[1]\n",
   "\n",
   "# Half-open [low, high), matching how NumPy assigns values to histogram bins.\n",
   "articles_in_zoomed_bin = articles_in_first_bin[\n",
   "    (articles_in_first_bin >= zoom_low) & (articles_in_first_bin < zoom_high)\n",
   "]\n",
   "\n",
   "# Default font size for any text that is not sized explicitly below\n",
   "plt.rcParams.update({'font.size': 14})\n",
   "\n",
   "fig, ax = plt.subplots(figsize=(10, 6))\n",
   "ax.hist(articles_in_zoomed_bin, bins=10, edgecolor='black')\n",
   "\n",
   "ax.set_title('Histogram of Articles per Journal in the First Bin Range', fontsize=16)\n",
   "ax.set_xlabel('Number of Articles', fontsize=14)\n",
   "ax.set_ylabel('Number of Journals', fontsize=14)\n",
   "ax.tick_params(axis='both', labelsize=12)\n",
   "\n",
   "fig.savefig('Histogram of Articles per Journal2.pdf', format='pdf')\n",
   "plt.show()\n" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## PubMed Filtered" ] },
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
   "# Filtered PubMed data (full set; its row count below equals train + validation + test)\n",
   "x = pd.read_csv('./Dataset/data_pubmed.csv')" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['publication_date', 'AKE_pubmed_id', 'AKE_pubmed_title', 'AKE_abstract',\n", " 'AKE_keywords', 'journal'],\n", " dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# Column names of the filtered dataframe\n",
   "x.columns" ] },
 { "cell_type": "code", "execution_count": 4, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "469" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many distinct journals remain after filtering\n",
   "x['journal'].nunique()" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "262870" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many articles (rows) remain after filtering\n",
   "len(x)" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
   "# NOTE(review): `do_lower_case` is a BERT-style option; confirm it has any effect for this RoBERTa checkpoint.\n",
   "tokenizer = AutoTokenizer.from_pretrained('allenai/cs_roberta_base', do_lower_case=True)" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import data_loader.CustomTokenize as ct" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
   "# Build one input text per article with the project helper\n",
   "samples = ct.process_samples2(x)" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
   "# Wrap the texts in a single-column dataframe for the length analysis below\n",
   "docs = pd.DataFrame(samples, columns=['Input'])" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Laser Application to the Root Surface Increases the Bonding Strength of Surface-Treated Prefabricated Glass-Fiber Posts in Teeth with Excessive Substance Loss\\nThis study examined the effect of roughening of the root surface using an erbium-doped yttrium aluminum garnet (Er: YAG) laser on the binding strength of teeth undergoing root canal treatment. Ninety single-rooted teeth were used and assigned randomly to 9 groups (n=10 each). Root canals were prepared using the FlexMaster rotary system. An Er: YAG laser was applied to the root canals in Group 1, with no surface treatment of the glass-fiber post. In Group 2, aluminum oxide particles were applied. 
In Group 3, the laser was applied to the root canals, with Cojet treatment. Group 4 received laser treatment and Clearfil Ceramic Primer. In Group 5, Clearfil Ceramic Primer silane coupling was performed on post surfaces without laser treatment. In Group 6, hydrofluoric acid (HF) application was followed by Clearfil Ceramic Primer cementing of the glass post surfaces with laser application. In Group 7, HF acid treatment was performed without laser. In Group 8, the laser was applied, followed by sanding of post surfaces using Korox 50, and silane coupling with Clearfil Ceramic Primer. In Group 9, the post surfaces were sanded using Korox 50 with laser application to the root canals. The samples were subjected to a push-out experiment. The data were analyzed using Friedman’s test and the Wilcoxon signed-rank test. A significant difference in bonding strength was found among the groups (p<0.005). Use of an Er: YAG laser in the root canal may be beneficial prior to bonding of glass posts.\\nLasers, Solid-State, Photoacoustic Techniques, Post and Core Technique, Smear Layer'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "docs['Input'][0]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). 
Running this sequence through the model will result in indexing errors\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Minimum Token Length: 21\n", "Mean Token Length: 399.9529349107924\n", "Maximum Token Length: 3448\n" ] } ], "source": [
   "def get_token_length(text: str) -> int:\n",
   "    \"\"\"Return the number of tokens the notebook-level `tokenizer` produces for `text`.\"\"\"\n",
   "    return len(tokenizer.tokenize(text))\n",
   "\n",
   "\n",
   "# Token count of every document, stored next to the text it was computed from\n",
   "docs['token_length'] = docs['Input'].map(get_token_length)\n",
   "\n",
   "# Summary statistics of the token counts\n",
   "token_lengths = docs['token_length']\n",
   "min_token_length = token_lengths.min()\n",
   "mean_token_length = token_lengths.mean()\n",
   "max_token_length = token_lengths.max()\n",
   "\n",
   "print(f\"Minimum Token Length: {min_token_length}\")\n",
   "print(f\"Mean Token Length: {mean_token_length}\")\n",
   "print(f\"Maximum Token Length: {max_token_length}\")" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
   "# Bin edges for the token-length ranges. pd.cut treats them as right-closed intervals:\n",
   "# [0, 256], (256, 512], (512, 768], (768, inf). `include_lowest=True` keeps a length of 0\n",
   "# in the first range instead of turning it into NaN.\n",
   "bins = [0, 256, 512, 768, float('inf')]\n",
   "\n",
   "# Labels spell out the integer ranges the intervals above actually cover\n",
   "category_labels = ['0-256', '257-512', '513-768', '>768']\n",
   "docs['token_length_category'] = pd.cut(\n",
   "    docs['token_length'], bins=bins, labels=category_labels, include_lowest=True\n",
   ")\n",
   "\n",
   "# Articles per range, in range order (empty ranges show up as 0)\n",
   "category_counts = docs['token_length_category'].value_counts().reindex(category_labels, fill_value=0)\n",
   "\n",
   "# Default font size for any text that is not sized explicitly below\n",
   "plt.rcParams.update({'font.size': 14})\n",
   "\n",
   "fig, ax = plt.subplots()\n",
   "category_counts.plot(kind='bar', rot=0, ax=ax)\n",
   "\n",
   "ax.set_title('Number of Articles by Token Length Range', fontsize=16)\n",
   "ax.set_xlabel('Token Length Range', fontsize=14)\n",
   "ax.set_ylabel('Number of Articles', fontsize=14)\n",
   "ax.tick_params(axis='both', labelsize=12)\n",
   "\n",
   "fig.savefig('Number of Articles by Token Length Range.pdf', format='pdf')\n",
   "plt.show()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## PubMed Filtered Train" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
   "# Filtered PubMed data, training split\n",
   "x = pd.read_csv('./Dataset/data_pubmed_train.csv')" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['publication_date', 'AKE_pubmed_id', 'AKE_pubmed_title', 'AKE_abstract',\n", " 'AKE_keywords', 'journal'],\n", " dtype='object')" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# Column names of the training split\n",
   "x.columns" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "469" ] }, 
"execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many distinct journals the training split covers\n",
   "x['journal'].nunique()" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "157540" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many articles (rows) the training split holds\n",
   "len(x)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## PubMed Filtered Validation" ] },
 { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
   "# Filtered PubMed data, validation split\n",
   "x = pd.read_csv('./Dataset/data_pubmed_val.csv')" ] },
 { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['publication_date', 'AKE_pubmed_id', 'AKE_pubmed_title', 'AKE_abstract',\n", " 'AKE_keywords', 'journal'],\n", " dtype='object')" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# Column names of the validation split\n",
   "x.columns" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "469" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many distinct journals the validation split covers\n",
   "x['journal'].nunique()" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "52571" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many articles (rows) the validation split holds\n",
   "len(x)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## PubMed Filtered Test" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
   "# Filtered PubMed data, test split\n",
   "x = pd.read_csv('./Dataset/data_pubmed_test.csv')" ] },
 { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['publication_date', 'AKE_pubmed_id', 'AKE_pubmed_title', 
'AKE_abstract',\n", " 'AKE_keywords', 'journal'],\n", " dtype='object')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# Column names of the test split\n",
   "x.columns" ] },
 { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "469" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many distinct journals the test split covers\n",
   "x['journal'].nunique()" ] },
 { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "52759" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "# How many articles (rows) the test split holds\n",
   "len(x)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Extract Labels" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "Fit a label encoder on the journal names and save it to `label_encoder.pkl` to use later." ] },
 { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.preprocessing import LabelEncoder" ] },
 { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [
   "# Fit on the full filtered file rather than on the test split alone, so the encoder\n",
   "# knows every journal that occurs in any split. Its stored row count above (262870)\n",
   "# equals train + validation + test (157540 + 52571 + 52759).\n",
   "# Only the label column is needed, so only that column is read.\n",
   "journal_labels = pd.read_csv('./Dataset/data_pubmed.csv', usecols=['journal'])['journal']\n",
   "label_encoder = LabelEncoder().fit(journal_labels)" ] },
 { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['label_encoder.pkl']" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "import joblib\n",
   "\n",
   "# Persist the fitted encoder so later code can reuse the same journal-to-id mapping\n",
   "joblib.dump(label_encoder, 'label_encoder.pkl')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Random Paper" ] },
 { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] },
 { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "x = pd.read_csv('./Dataset/data_pubmed_test.csv')" ] },
 { "cell_type": "code", 
"execution_count": 32, "metadata": {}, "outputs": [], "source": [
   "# Rows of the test split whose title matches the chosen paper exactly\n",
   "selected_article = x[x['AKE_pubmed_title'] == 'Environment-dependent attack rates of cryptic and aposematic butterflies']" ] },
 { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [
   "selected_article.to_csv('RandomArticle.csv', index=False)" ] },
 { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [
   "# Sorted so the file is identical on every run (a set has no stable order).\n",
   "# Missing journals are dropped: NaN is a float and cannot be written as a text line.\n",
   "unique_journals = sorted(x['journal'].dropna().unique())\n",
   "\n",
   "# Save the unique journal names, one per line\n",
   "with open('unique_journals.txt', 'w', encoding='utf-8') as file:\n",
   "    file.writelines(f'{journal}\\n' for journal in unique_journals)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Random Journal" ] },
 { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "import random" ] },
 { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "x = pd.read_csv('./Dataset/data_pubmed_test.csv')" ] },
 { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Neural regeneration research\n" ] } ], "source": [
   "# One journal entry per article row\n",
   "journal_entries = x['journal']\n",
   "\n",
   "# Pick a random row position. `.iloc` is positional, so the lookup stays correct even\n",
   "# if the dataframe index is not the default 0..n-1 (plain `[]` would look up a label).\n",
   "# NOTE(review): the draw is unseeded, so every run may print a different journal;\n",
   "# seed `random` first if the choice has to be reproducible.\n",
   "random_index = random.randrange(len(journal_entries))\n",
   "random_journal = journal_entries.iloc[random_index]\n",
   "\n",
   "print(random_journal)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "Search for the printed journal in https://pubmed.ncbi.nlm.nih.gov/" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "TFM-LuckyLook", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": 
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }