{ "cells": [ { "cell_type": "markdown", "id": "0f3ca52d-08c9-41b7-9439-0ef80dfa2b04", "metadata": {}, "source": [ "# Generating data for embedding training materials\n", "\n", "In this notebook we will download training materials in PDF format, extract the text of every page and save the pages as PNG files. We will then use OpenAI's text embeddings and UMAP dimensionality reduction to get a simple embedding of training materials contents.\n", "\n", "```\n", "pip install PyPDF2 pdf2image\n", "conda install umap-learn openai\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "id": "d285437d-c360-4971-8367-d8dfa4626974", "metadata": {}, "outputs": [], "source": [ "import os\n", "import requests\n", "import json\n", "from pdf2image import convert_from_path\n", "import PyPDF2\n", "import numpy as np\n", "import pandas as pd\n", "from openai import OpenAI\n", "import umap\n", "import stackview as sv\n", "from PIL import Image\n", "import tempfile" ] }, { "cell_type": "markdown", "id": "0376e099-4f5f-4a86-bfb3-587e14b1d62d", "metadata": {}, "source": [ "We will use trainng materials about [Bio-image Data Science](https://zenodo.org/records/14030307), which is licensed [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) by Robert Haase." ] }, { "cell_type": "code", "execution_count": 2, "id": "3b7287dc-7f7d-42d3-b76d-903b3572ceea", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'filename': '12623730_14_Summary.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/14_Summary.pdf/content'},\n", " {'filename': '12623730_10_function_calling.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/10_function_calling.pdf/content'},\n", " {'filename': '12623730_11_prompteng_rag_finetuning.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/11_prompteng_rag_finetuning.pdf/content'},\n", " {'filename': '12623730_12_Vision_models.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/12_Vision_models.pdf/content'},\n", " {'filename': '12623730_09_Deep_Learning.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/09_Deep_Learning.pdf/content'},\n", " {'filename': '12623730_08_Sup_Unsup_Machine_Learning.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/08_Sup_Unsup_Machine_Learning.pdf/content'},\n", " {'filename': '12623730_03_RSM_Image_Processing.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/03_RSM_Image_Processing.pdf/content'},\n", " {'filename': '12623730_01_Introduction_BIDS_2024.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/01_Introduction_BIDS_2024.pdf/content'},\n", " {'filename': '12623730_13_quality_assurance.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/13_quality_assurance.pdf/content'},\n", " {'filename': '12623730_02_Introduction_RDM_2024.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/02_Introduction_RDM_2024.pdf/content'},\n", " {'filename': '12623730_04_Image_segmentation.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/04_Image_segmentation.pdf/content'},\n", " {'filename': '12623730_05_Surface_Recon_QA.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/05_Surface_Recon_QA.pdf/content'},\n", " {'filename': '12623730_07_distributed_gpu_computing.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/07_distributed_gpu_computing.pdf/content'},\n", " {'filename': '12623730_06_Chatbots.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/06_Chatbots.pdf/content'}]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def download_pdfs_from_zenodo(record_id):\n", " \"\"\"Download PDFs from Zenodo record.\"\"\"\n", " base_url = f\"https://zenodo.org/api/records/{record_id}\"\n", " response = requests.get(base_url)\n", " data = response.json()\n", " \n", " if not os.path.exists('downloads'):\n", " os.makedirs('downloads')\n", " \n", " files_info = []\n", " for file in data['files']:\n", " if file['key'].endswith('.pdf'):\n", " download_url = file['links']['self']\n", " filename = record_id + \"_\" + file['key']\n", " filepath = os.path.join('downloads', filename)\n", "\n", " if not os.path.exists(filepath):\n", " # Download file\n", " response = requests.get(download_url)\n", " with open(filepath, 'wb') as f:\n", " f.write(response.content)\n", " \n", " files_info.append({'filename': filename, 'url': download_url})\n", " \n", " return files_info\n", "\n", "\n", "# Download PDFs\n", "files_info = download_pdfs_from_zenodo('12623730')\n", "files_info" ] }, { "cell_type": "markdown", "id": "9737606c-f8f8-40c4-88f4-80acbd6609c7", "metadata": {}, "source": [ "Next we go through all pages, save them as PDF and take the text to create embedding vectors." ] }, { "cell_type": "code", "execution_count": 3, "id": "89890a80-5b2c-44f6-9acb-781ab82b79a4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\rober\\AppData\\Local\\Temp\\ipykernel_12900\\2891063455.py:19: DeprecationWarning: LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead.\n", " return image.resize((new_width, height), Image.LANCZOS)\n" ] } ], "source": [ "\n", "def resize_image(image, height):\n", " \"\"\"\n", " Resize the image to the specified height while maintaining aspect ratio.\n", "\n", " Parameters\n", " ----------\n", " image : PIL.Image.Image\n", " The image to resize.\n", " height : int\n", " The desired height in pixels.\n", " \n", " Returns\n", " -------\n", " PIL.Image.Image\n", " The resized image.\n", " \"\"\"\n", " aspect_ratio = image.width / image.height\n", " new_width = int(aspect_ratio * height)\n", " return image.resize((new_width, height), Image.LANCZOS)\n", "\n", "def process_pdf(pdf_info):\n", " \"\"\"Process PDF file to extract images and text.\"\"\"\n", " filename = pdf_info['filename']\n", " filepath = os.path.join('downloads', filename)\n", " base_name = os.path.splitext(filename)[0]\n", " \n", " if not os.path.exists('downloads'):\n", " os.makedirs('downloads')\n", " if not os.path.exists('downloads'):\n", " os.makedirs('downloads')\n", "\n", " # Set your OpenAI API key\n", " client = OpenAI()\n", " \n", " # Convert PDF pages to images\n", " images = [resize_image(i, height=300) for i in convert_from_path(filepath)]\n", " \n", " # Extract text from PDF\n", " pdf_reader = PyPDF2.PdfReader(filepath)\n", " \n", " page_data = []\n", " \n", " for i, image in enumerate(images):\n", " # Save image\n", " png_filename = f\"{base_name}_{i}.png\"\n", " png_path = os.path.join('downloads', png_filename)\n", " image.save(png_path)\n", " \n", " # Save text\n", " txt_filename = f\"{base_name}_{i}.txt\"\n", " txt_path = os.path.join('downloads', txt_filename)\n", " text = pdf_reader.pages[i].extract_text()\n", " if not os.path.exists(txt_filename):\n", " with open(txt_path, 'w', encoding='utf-8') as f:\n", " f.write(text)\n", " \n", " # Get embedding\n", " response = client.embeddings.create(\n", " input=text,\n", " model=\"text-embedding-ada-002\"\n", " )\n", " embedding_vector = response.data[0].embedding\n", " \n", " page_data.append({\n", " 'filename': filename,\n", " 'url': pdf_info['url'],\n", " 'page_index': i,\n", " 'text': text,\n", " 'png_filename': png_filename,\n", " 'txt_filename': txt_filename,\n", " 'embedding_vector': embedding_vector\n", " })\n", " \n", " return page_data\n", "\n", "# Process all PDFs\n", "all_page_data = []\n", "for pdf_info in files_info:\n", " all_page_data.extend(process_pdf(pdf_info))" ] }, { "cell_type": "markdown", "id": "c363ce5e-8fa1-4d57-9e7f-fec4a3af590b", "metadata": {}, "source": [ "The result will be saved as dataframe." ] }, { "cell_type": "code", "execution_count": 4, "id": "fd3cd5fe-1b78-4a5c-94d8-497b3ef57f4a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenameurlpage_indextextpng_filenametxt_filenameembedding_vector
012623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...0Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_0.png12623730_14_Summary_0.txt[-0.01753188483417034, 0.009571048431098461, 0...
112623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...1Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_1.png12623730_14_Summary_1.txt[0.001144174369983375, 0.008919398300349712, -...
212623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...2Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_2.png12623730_14_Summary_2.txt[0.01131830457597971, 0.033359214663505554, 0....
312623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...3Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_3.png12623730_14_Summary_3.txt[0.018105685710906982, 0.026488685980439186, 0...
412623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...4Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_4.png12623730_14_Summary_4.txt[-0.027609605342149734, 0.0015738429501652718,...
........................
85812623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...29Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_29.png12623730_06_Chatbots_29.txt[-0.011728818528354168, -0.0007099526119418442...
85912623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...30Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_30.png12623730_06_Chatbots_30.txt[-0.007209372241050005, 0.004134070128202438, ...
86012623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...31Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_31.png12623730_06_Chatbots_31.txt[-0.014446760527789593, 0.013194024562835693, ...
86112623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...32Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_32.png12623730_06_Chatbots_32.txt[-0.035361308604478836, -0.001887816353701055,...
86212623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...33Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_33.png12623730_06_Chatbots_33.txt[-0.006428151857107878, 0.016945311799645424, ...
\n", "

863 rows × 7 columns

\n", "
" ], "text/plain": [ " filename \\\n", "0 12623730_14_Summary.pdf \n", "1 12623730_14_Summary.pdf \n", "2 12623730_14_Summary.pdf \n", "3 12623730_14_Summary.pdf \n", "4 12623730_14_Summary.pdf \n", ".. ... \n", "858 12623730_06_Chatbots.pdf \n", "859 12623730_06_Chatbots.pdf \n", "860 12623730_06_Chatbots.pdf \n", "861 12623730_06_Chatbots.pdf \n", "862 12623730_06_Chatbots.pdf \n", "\n", " url page_index \\\n", "0 https://zenodo.org/api/records/12623730/files/... 0 \n", "1 https://zenodo.org/api/records/12623730/files/... 1 \n", "2 https://zenodo.org/api/records/12623730/files/... 2 \n", "3 https://zenodo.org/api/records/12623730/files/... 3 \n", "4 https://zenodo.org/api/records/12623730/files/... 4 \n", ".. ... ... \n", "858 https://zenodo.org/api/records/12623730/files/... 29 \n", "859 https://zenodo.org/api/records/12623730/files/... 30 \n", "860 https://zenodo.org/api/records/12623730/files/... 31 \n", "861 https://zenodo.org/api/records/12623730/files/... 32 \n", "862 https://zenodo.org/api/records/12623730/files/... 33 \n", "\n", " text \\\n", "0 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "1 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "2 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "3 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "4 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", ".. ... \n", "858 Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "859 Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "860 Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "861 Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "862 Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "\n", " png_filename txt_filename \\\n", "0 12623730_14_Summary_0.png 12623730_14_Summary_0.txt \n", "1 12623730_14_Summary_1.png 12623730_14_Summary_1.txt \n", "2 12623730_14_Summary_2.png 12623730_14_Summary_2.txt \n", "3 12623730_14_Summary_3.png 12623730_14_Summary_3.txt \n", "4 12623730_14_Summary_4.png 12623730_14_Summary_4.txt \n", ".. ... ... \n", "858 12623730_06_Chatbots_29.png 12623730_06_Chatbots_29.txt \n", "859 12623730_06_Chatbots_30.png 12623730_06_Chatbots_30.txt \n", "860 12623730_06_Chatbots_31.png 12623730_06_Chatbots_31.txt \n", "861 12623730_06_Chatbots_32.png 12623730_06_Chatbots_32.txt \n", "862 12623730_06_Chatbots_33.png 12623730_06_Chatbots_33.txt \n", "\n", " embedding_vector \n", "0 [-0.01753188483417034, 0.009571048431098461, 0... \n", "1 [0.001144174369983375, 0.008919398300349712, -... \n", "2 [0.01131830457597971, 0.033359214663505554, 0.... \n", "3 [0.018105685710906982, 0.026488685980439186, 0... \n", "4 [-0.027609605342149734, 0.0015738429501652718,... \n", ".. ... \n", "858 [-0.011728818528354168, -0.0007099526119418442... \n", "859 [-0.007209372241050005, 0.004134070128202438, ... \n", "860 [-0.014446760527789593, 0.013194024562835693, ... \n", "861 [-0.035361308604478836, -0.001887816353701055,... \n", "862 [-0.006428151857107878, 0.016945311799645424, ... \n", "\n", "[863 rows x 7 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create DataFrame\n", "df = pd.DataFrame(all_page_data)\n", "df" ] }, { "cell_type": "markdown", "id": "2da78c3a-7943-4497-849d-26df0887a156", "metadata": {}, "source": [ "We then perform dimensionality reduction on the embedding vectors and add two new columns to the dataset: UMAP0 and UMAP1." ] }, { "cell_type": "code", "execution_count": 5, "id": "617c5038-6f7e-4615-9008-6de4c328bd87", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\rober\\miniforge3\\envs\\devbio-napari-env\\Lib\\site-packages\\umap\\umap_.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n", " warn(f\"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.\")\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenameurlpage_indextextpng_filenametxt_filenameembedding_vectorUMAP0UMAP1
012623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...0Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_0.png12623730_14_Summary_0.txt[-0.01753188483417034, 0.009571048431098461, 0...2.7852995.125338
112623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...1Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_1.png12623730_14_Summary_1.txt[0.001144174369983375, 0.008919398300349712, -...1.7591095.196022
212623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...2Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_2.png12623730_14_Summary_2.txt[0.01131830457597971, 0.033359214663505554, 0....1.6058596.084491
312623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...3Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_3.png12623730_14_Summary_3.txt[0.018105685710906982, 0.026488685980439186, 0...1.5819076.084695
412623730_14_Summary.pdfhttps://zenodo.org/api/records/12623730/files/...4Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...12623730_14_Summary_4.png12623730_14_Summary_4.txt[-0.027609605342149734, 0.0015738429501652718,...2.1631197.161102
..............................
85812623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...29Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_29.png12623730_06_Chatbots_29.txt[-0.011728818528354168, -0.0007099526119418442...3.9703445.525424
85912623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...30Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_30.png12623730_06_Chatbots_30.txt[-0.007209372241050005, 0.004134070128202438, ...5.6931067.587674
86012623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...31Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_31.png12623730_06_Chatbots_31.txt[-0.014446760527789593, 0.013194024562835693, ...4.2859615.682843
86112623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...32Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_32.png12623730_06_Chatbots_32.txt[-0.035361308604478836, -0.001887816353701055,...6.0683714.506485
86212623730_06_Chatbots.pdfhttps://zenodo.org/api/records/12623730/files/...33Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...12623730_06_Chatbots_33.png12623730_06_Chatbots_33.txt[-0.006428151857107878, 0.016945311799645424, ...4.3304155.363225
\n", "

863 rows × 9 columns

\n", "
" ], "text/plain": [ " filename \\\n", "0 12623730_14_Summary.pdf \n", "1 12623730_14_Summary.pdf \n", "2 12623730_14_Summary.pdf \n", "3 12623730_14_Summary.pdf \n", "4 12623730_14_Summary.pdf \n", ".. ... \n", "858 12623730_06_Chatbots.pdf \n", "859 12623730_06_Chatbots.pdf \n", "860 12623730_06_Chatbots.pdf \n", "861 12623730_06_Chatbots.pdf \n", "862 12623730_06_Chatbots.pdf \n", "\n", " url page_index \\\n", "0 https://zenodo.org/api/records/12623730/files/... 0 \n", "1 https://zenodo.org/api/records/12623730/files/... 1 \n", "2 https://zenodo.org/api/records/12623730/files/... 2 \n", "3 https://zenodo.org/api/records/12623730/files/... 3 \n", "4 https://zenodo.org/api/records/12623730/files/... 4 \n", ".. ... ... \n", "858 https://zenodo.org/api/records/12623730/files/... 29 \n", "859 https://zenodo.org/api/records/12623730/files/... 30 \n", "860 https://zenodo.org/api/records/12623730/files/... 31 \n", "861 https://zenodo.org/api/records/12623730/files/... 32 \n", "862 https://zenodo.org/api/records/12623730/files/... 33 \n", "\n", " text \\\n", "0 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "1 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "2 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "3 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", "4 Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... \n", ".. ... \n", "858 Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "859 Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "860 Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "861 Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "862 Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... \n", "\n", " png_filename txt_filename \\\n", "0 12623730_14_Summary_0.png 12623730_14_Summary_0.txt \n", "1 12623730_14_Summary_1.png 12623730_14_Summary_1.txt \n", "2 12623730_14_Summary_2.png 12623730_14_Summary_2.txt \n", "3 12623730_14_Summary_3.png 12623730_14_Summary_3.txt \n", "4 12623730_14_Summary_4.png 12623730_14_Summary_4.txt \n", ".. ... ... \n", "858 12623730_06_Chatbots_29.png 12623730_06_Chatbots_29.txt \n", "859 12623730_06_Chatbots_30.png 12623730_06_Chatbots_30.txt \n", "860 12623730_06_Chatbots_31.png 12623730_06_Chatbots_31.txt \n", "861 12623730_06_Chatbots_32.png 12623730_06_Chatbots_32.txt \n", "862 12623730_06_Chatbots_33.png 12623730_06_Chatbots_33.txt \n", "\n", " embedding_vector UMAP0 UMAP1 \n", "0 [-0.01753188483417034, 0.009571048431098461, 0... 2.785299 5.125338 \n", "1 [0.001144174369983375, 0.008919398300349712, -... 1.759109 5.196022 \n", "2 [0.01131830457597971, 0.033359214663505554, 0.... 1.605859 6.084491 \n", "3 [0.018105685710906982, 0.026488685980439186, 0... 1.581907 6.084695 \n", "4 [-0.027609605342149734, 0.0015738429501652718,... 2.163119 7.161102 \n", ".. ... ... ... \n", "858 [-0.011728818528354168, -0.0007099526119418442... 3.970344 5.525424 \n", "859 [-0.007209372241050005, 0.004134070128202438, ... 5.693106 7.587674 \n", "860 [-0.014446760527789593, 0.013194024562835693, ... 4.285961 5.682843 \n", "861 [-0.035361308604478836, -0.001887816353701055,... 6.068371 4.506485 \n", "862 [-0.006428151857107878, 0.016945311799645424, ... 4.330415 5.363225 \n", "\n", "[863 rows x 9 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert embedding vectors to numpy array for UMAP\n", "embeddings = np.array(df['embedding_vector'].tolist())\n", "\n", "# Apply UMAP\n", "reducer = umap.UMAP(n_components=2, random_state=42)\n", "umap_embeddings = reducer.fit_transform(embeddings)\n", "\n", "df['UMAP0'] = umap_embeddings[:, 0]\n", "df['UMAP1'] = umap_embeddings[:, 1]\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 6, "id": "f12684d8-4580-4235-b07a-acb6c8718d3e", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eda68fcaa68842d8b19de4245a1c8a38", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(VBox(children=(VBox(children=(HBox(children=(Label(value='Axes '), Dropdown(index=7, layout=Lay…" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import stackview\n", "stackview.scatterplot(df, column_x=\"UMAP0\", column_y=\"UMAP1\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "dcc8cef5-6524-4074-af07-fddb9033fbc8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", " ... \n", "858 False\n", "859 False\n", "860 False\n", "861 False\n", "862 False\n", "Name: selection, Length: 863, dtype: bool" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"selection\"]" ] }, { "cell_type": "markdown", "id": "b678815a-f383-423e-8f4b-0f86f8992bb1", "metadata": {}, "source": [ "Finally, we save the data in two yml files: One complete set including the embedding vectors and another one with only the UMAP columns." ] }, { "cell_type": "code", "execution_count": 9, "id": "7b395f6a-514b-4756-825a-06bdf5d58937", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DataFrame saved as data.yml\n" ] } ], "source": [ "import yaml\n", "\n", "# Convert DataFrame to dictionary\n", "data_dict = df.to_dict()\n", "\n", "# Save as YAML file\n", "with open('data.yml', 'w') as file:\n", " yaml.dump(data_dict, file)\n", "\n", "print(\"DataFrame saved as data.yml\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "6653ecec-524b-4a94-8f92-3126a5e91e9e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DataFrame saved as data.yml\n" ] } ], "source": [ "import yaml\n", "\n", "# Convert DataFrame to dictionary\n", "data_dict = df[[\"filename\", \"png_filename\", \"text\", \"url\", \"page_index\", \"UMAP0\", \"UMAP1\"]].to_dict()\n", "\n", "# Save as YAML file\n", "with open('data_png_umap.yml', 'w') as file:\n", " yaml.dump(data_dict, file)\n", "\n", "print(\"DataFrame saved as data.yml\")" ] }, { "cell_type": "code", "execution_count": null, "id": "fe545005-5a2a-47fd-b021-79c2ae6fcedd", "metadata": {}, "outputs": [], "source": [ "# Read YAML file\n", "with open('data.yml', 'r') as file:\n", " loaded_dict = yaml.safe_load(file)\n", "\n", "# Convert dictionary back to DataFrame \n", "loaded_df = pd.DataFrame(loaded_dict)\n", "\n", "# Show first few rows of the loaded DataFrame\n", "loaded_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "699e33da-cc31-441d-887d-7e7fd0c02db5", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }