{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0f3ca52d-08c9-41b7-9439-0ef80dfa2b04",
   "metadata": {},
   "source": [
    "# Generating data for embedding training materials\n",
    "\n",
    "In this notebook we will download training materials in PDF format, extract the text of every page and save the pages as PNG files. We will then use OpenAI's text embeddings and UMAP dimensionality reduction to get a simple two-dimensional embedding of the training materials' content.\n",
    "\n",
    "```\n",
    "pip install PyPDF2 pdf2image stackview requests pandas\n",
    "conda install umap-learn openai\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d285437d-c360-4971-8367-d8dfa4626974",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "import json\n",
    "from pdf2image import convert_from_path\n",
    "import PyPDF2\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from openai import OpenAI\n",
    "import umap\n",
    "import stackview as sv\n",
    "from PIL import Image\n",
    "import tempfile"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0376e099-4f5f-4a86-bfb3-587e14b1d62d",
   "metadata": {},
   "source": [
    "We will use training materials about [Bio-image Data Science](https://zenodo.org/records/14030307), which are licensed [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) by Robert Haase."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3b7287dc-7f7d-42d3-b76d-903b3572ceea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'filename': '12623730_14_Summary.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/14_Summary.pdf/content'},\n",
       " {'filename': '12623730_10_function_calling.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/10_function_calling.pdf/content'},\n",
       " {'filename': '12623730_11_prompteng_rag_finetuning.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/11_prompteng_rag_finetuning.pdf/content'},\n",
       " {'filename': '12623730_12_Vision_models.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/12_Vision_models.pdf/content'},\n",
       " {'filename': '12623730_09_Deep_Learning.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/09_Deep_Learning.pdf/content'},\n",
       " {'filename': '12623730_08_Sup_Unsup_Machine_Learning.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/08_Sup_Unsup_Machine_Learning.pdf/content'},\n",
       " {'filename': '12623730_03_RSM_Image_Processing.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/03_RSM_Image_Processing.pdf/content'},\n",
       " {'filename': '12623730_01_Introduction_BIDS_2024.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/01_Introduction_BIDS_2024.pdf/content'},\n",
       " {'filename': '12623730_13_quality_assurance.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/13_quality_assurance.pdf/content'},\n",
       " {'filename': '12623730_02_Introduction_RDM_2024.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/02_Introduction_RDM_2024.pdf/content'},\n",
       " {'filename': '12623730_04_Image_segmentation.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/04_Image_segmentation.pdf/content'},\n",
       " {'filename': '12623730_05_Surface_Recon_QA.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/05_Surface_Recon_QA.pdf/content'},\n",
       " {'filename': '12623730_07_distributed_gpu_computing.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/07_distributed_gpu_computing.pdf/content'},\n",
       " {'filename': '12623730_06_Chatbots.pdf',\n",
       "  'url': 'https://zenodo.org/api/records/12623730/files/06_Chatbots.pdf/content'}]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def download_pdfs_from_zenodo(record_id):\n",
    "    \"\"\"\n",
    "    Download all PDF files attached to a Zenodo record into 'downloads/'.\n",
    "\n",
    "    Files that already exist locally are not downloaded again.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    record_id : str or int\n",
    "        Identifier of the Zenodo record.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One dict per PDF with the keys 'filename' (local file name,\n",
    "        prefixed with the record id) and 'url' (download link).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.HTTPError\n",
    "        If the record metadata or a file download returns an error status.\n",
    "    \"\"\"\n",
    "    base_url = f\"https://zenodo.org/api/records/{record_id}\"\n",
    "    record_response = requests.get(base_url)\n",
    "    # Fail loudly instead of parsing an error page as record metadata\n",
    "    record_response.raise_for_status()\n",
    "    data = record_response.json()\n",
    "\n",
    "    os.makedirs('downloads', exist_ok=True)\n",
    "\n",
    "    files_info = []\n",
    "    for file in data['files']:\n",
    "        if file['key'].endswith('.pdf'):\n",
    "            download_url = file['links']['self']\n",
    "            # f-string so that integer record ids work as well\n",
    "            filename = f\"{record_id}_{file['key']}\"\n",
    "            filepath = os.path.join('downloads', filename)\n",
    "\n",
    "            if not os.path.exists(filepath):\n",
    "                file_response = requests.get(download_url)\n",
    "                # Never cache an HTTP error body on disk as if it were the PDF\n",
    "                file_response.raise_for_status()\n",
    "                with open(filepath, 'wb') as f:\n",
    "                    f.write(file_response.content)\n",
    "\n",
    "            files_info.append({'filename': filename, 'url': download_url})\n",
    "\n",
    "    return files_info\n",
    "\n",
    "\n",
    "# Fetch the PDFs of Zenodo record 12623730 and list them\n",
    "files_info = download_pdfs_from_zenodo(record_id='12623730')\n",
    "files_info"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9737606c-f8f8-40c4-88f4-80acbd6609c7",
   "metadata": {},
   "source": [
    "Next we go through all pages, save them as PNG images and use their text to create embedding vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "89890a80-5b2c-44f6-9acb-781ab82b79a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\rober\\AppData\\Local\\Temp\\ipykernel_12900\\2891063455.py:19: DeprecationWarning: LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead.\n",
      "  return image.resize((new_width, height), Image.LANCZOS)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "def resize_image(image, height):\n",
    "    \"\"\"\n",
    "    Resize the image to the specified height while maintaining aspect ratio.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    image : PIL.Image.Image\n",
    "        The image to resize.\n",
    "    height : int\n",
    "        The desired height in pixels.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    PIL.Image.Image\n",
    "        The resized image; its width is at least one pixel.\n",
    "    \"\"\"\n",
    "    aspect_ratio = image.width / image.height\n",
    "    # Truncation can yield 0 for very tall images; keep at least one pixel\n",
    "    new_width = max(1, int(aspect_ratio * height))\n",
    "    # Prefer Image.Resampling.LANCZOS where it exists: the bare Image.LANCZOS\n",
    "    # spelling emits a DeprecationWarning in some Pillow 9.x releases.\n",
    "    resampling = getattr(Image, 'Resampling', Image)\n",
    "    return image.resize((new_width, height), resampling.LANCZOS)\n",
    "\n",
    "def process_pdf(pdf_info):\n",
    "    \"\"\"\n",
    "    Split a downloaded PDF into per-page PNG, text and embedding records.\n",
    "\n",
    "    For every page, a 300 px high PNG and a UTF-8 text file are written to\n",
    "    'downloads/', and the page text is sent to the OpenAI embeddings API.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    pdf_info : dict\n",
    "        Dict with the keys 'filename' (PDF file name inside 'downloads/')\n",
    "        and 'url' (copied unchanged into every page record).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One dict per page with the keys 'filename', 'url', 'page_index',\n",
    "        'text', 'png_filename', 'txt_filename' and 'embedding_vector'.\n",
    "    \"\"\"\n",
    "    filename = pdf_info['filename']\n",
    "    filepath = os.path.join('downloads', filename)\n",
    "    base_name = os.path.splitext(filename)[0]\n",
    "\n",
    "    os.makedirs('downloads', exist_ok=True)\n",
    "\n",
    "    # No key is passed explicitly; the client reads OPENAI_API_KEY from the\n",
    "    # environment, so the key never has to appear in the notebook.\n",
    "    client = OpenAI()\n",
    "\n",
    "    # Render every PDF page as an image, scaled down to thumbnail height\n",
    "    images = [resize_image(i, height=300) for i in convert_from_path(filepath)]\n",
    "\n",
    "    # Reader used to extract the text of each page\n",
    "    pdf_reader = PyPDF2.PdfReader(filepath)\n",
    "\n",
    "    page_data = []\n",
    "\n",
    "    for i, image in enumerate(images):\n",
    "        # Save image\n",
    "        png_filename = f\"{base_name}_{i}.png\"\n",
    "        png_path = os.path.join('downloads', png_filename)\n",
    "        image.save(png_path)\n",
    "\n",
    "        # Save text. It is always (re)written, like the PNG above, so the file\n",
    "        # on disk matches the 'text' stored in the returned record. The former\n",
    "        # existence check tested the bare file name against the working\n",
    "        # directory instead of txt_path and therefore never skipped the write.\n",
    "        txt_filename = f\"{base_name}_{i}.txt\"\n",
    "        txt_path = os.path.join('downloads', txt_filename)\n",
    "        text = pdf_reader.pages[i].extract_text()\n",
    "        with open(txt_path, 'w', encoding='utf-8') as f:\n",
    "            f.write(text)\n",
    "\n",
    "        # Get embedding\n",
    "        response = client.embeddings.create(\n",
    "            input=text,\n",
    "            model=\"text-embedding-ada-002\"\n",
    "        )\n",
    "        embedding_vector = response.data[0].embedding\n",
    "\n",
    "        page_data.append({\n",
    "            'filename': filename,\n",
    "            'url': pdf_info['url'],\n",
    "            'page_index': i,\n",
    "            'text': text,\n",
    "            'png_filename': png_filename,\n",
    "            'txt_filename': txt_filename,\n",
    "            'embedding_vector': embedding_vector\n",
    "        })\n",
    "\n",
    "    return page_data\n",
    "\n",
    "# Collect the per-page records of every downloaded PDF in one flat list\n",
    "all_page_data = []\n",
    "for pdf_info in files_info:\n",
    "    all_page_data += process_pdf(pdf_info)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c363ce5e-8fa1-4d57-9e7f-fec4a3af590b",
   "metadata": {},
   "source": [
    "The result is collected in a pandas DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fd3cd5fe-1b78-4a5c-94d8-497b3ef57f4a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "      <th>url</th>\n",
       "      <th>page_index</th>\n",
       "      <th>text</th>\n",
       "      <th>png_filename</th>\n",
       "      <th>txt_filename</th>\n",
       "      <th>embedding_vector</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>0</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_0.png</td>\n",
       "      <td>12623730_14_Summary_0.txt</td>\n",
       "      <td>[-0.01753188483417034, 0.009571048431098461, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>1</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_1.png</td>\n",
       "      <td>12623730_14_Summary_1.txt</td>\n",
       "      <td>[0.001144174369983375, 0.008919398300349712, -...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>2</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_2.png</td>\n",
       "      <td>12623730_14_Summary_2.txt</td>\n",
       "      <td>[0.01131830457597971, 0.033359214663505554, 0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>3</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_3.png</td>\n",
       "      <td>12623730_14_Summary_3.txt</td>\n",
       "      <td>[0.018105685710906982, 0.026488685980439186, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>4</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_4.png</td>\n",
       "      <td>12623730_14_Summary_4.txt</td>\n",
       "      <td>[-0.027609605342149734, 0.0015738429501652718,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>858</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>29</td>\n",
       "      <td>Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_29.png</td>\n",
       "      <td>12623730_06_Chatbots_29.txt</td>\n",
       "      <td>[-0.011728818528354168, -0.0007099526119418442...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>859</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>30</td>\n",
       "      <td>Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_30.png</td>\n",
       "      <td>12623730_06_Chatbots_30.txt</td>\n",
       "      <td>[-0.007209372241050005, 0.004134070128202438, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>31</td>\n",
       "      <td>Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_31.png</td>\n",
       "      <td>12623730_06_Chatbots_31.txt</td>\n",
       "      <td>[-0.014446760527789593, 0.013194024562835693, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>861</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>32</td>\n",
       "      <td>Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_32.png</td>\n",
       "      <td>12623730_06_Chatbots_32.txt</td>\n",
       "      <td>[-0.035361308604478836, -0.001887816353701055,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>862</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>33</td>\n",
       "      <td>Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_33.png</td>\n",
       "      <td>12623730_06_Chatbots_33.txt</td>\n",
       "      <td>[-0.006428151857107878, 0.016945311799645424, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>863 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     filename  \\\n",
       "0     12623730_14_Summary.pdf   \n",
       "1     12623730_14_Summary.pdf   \n",
       "2     12623730_14_Summary.pdf   \n",
       "3     12623730_14_Summary.pdf   \n",
       "4     12623730_14_Summary.pdf   \n",
       "..                        ...   \n",
       "858  12623730_06_Chatbots.pdf   \n",
       "859  12623730_06_Chatbots.pdf   \n",
       "860  12623730_06_Chatbots.pdf   \n",
       "861  12623730_06_Chatbots.pdf   \n",
       "862  12623730_06_Chatbots.pdf   \n",
       "\n",
       "                                                   url  page_index  \\\n",
       "0    https://zenodo.org/api/records/12623730/files/...           0   \n",
       "1    https://zenodo.org/api/records/12623730/files/...           1   \n",
       "2    https://zenodo.org/api/records/12623730/files/...           2   \n",
       "3    https://zenodo.org/api/records/12623730/files/...           3   \n",
       "4    https://zenodo.org/api/records/12623730/files/...           4   \n",
       "..                                                 ...         ...   \n",
       "858  https://zenodo.org/api/records/12623730/files/...          29   \n",
       "859  https://zenodo.org/api/records/12623730/files/...          30   \n",
       "860  https://zenodo.org/api/records/12623730/files/...          31   \n",
       "861  https://zenodo.org/api/records/12623730/files/...          32   \n",
       "862  https://zenodo.org/api/records/12623730/files/...          33   \n",
       "\n",
       "                                                  text  \\\n",
       "0    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "1    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "2    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "3    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "4    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "..                                                 ...   \n",
       "858  Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "859  Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "860  Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "861  Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "862  Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "\n",
       "                    png_filename                 txt_filename  \\\n",
       "0      12623730_14_Summary_0.png    12623730_14_Summary_0.txt   \n",
       "1      12623730_14_Summary_1.png    12623730_14_Summary_1.txt   \n",
       "2      12623730_14_Summary_2.png    12623730_14_Summary_2.txt   \n",
       "3      12623730_14_Summary_3.png    12623730_14_Summary_3.txt   \n",
       "4      12623730_14_Summary_4.png    12623730_14_Summary_4.txt   \n",
       "..                           ...                          ...   \n",
       "858  12623730_06_Chatbots_29.png  12623730_06_Chatbots_29.txt   \n",
       "859  12623730_06_Chatbots_30.png  12623730_06_Chatbots_30.txt   \n",
       "860  12623730_06_Chatbots_31.png  12623730_06_Chatbots_31.txt   \n",
       "861  12623730_06_Chatbots_32.png  12623730_06_Chatbots_32.txt   \n",
       "862  12623730_06_Chatbots_33.png  12623730_06_Chatbots_33.txt   \n",
       "\n",
       "                                      embedding_vector  \n",
       "0    [-0.01753188483417034, 0.009571048431098461, 0...  \n",
       "1    [0.001144174369983375, 0.008919398300349712, -...  \n",
       "2    [0.01131830457597971, 0.033359214663505554, 0....  \n",
       "3    [0.018105685710906982, 0.026488685980439186, 0...  \n",
       "4    [-0.027609605342149734, 0.0015738429501652718,...  \n",
       "..                                                 ...  \n",
       "858  [-0.011728818528354168, -0.0007099526119418442...  \n",
       "859  [-0.007209372241050005, 0.004134070128202438, ...  \n",
       "860  [-0.014446760527789593, 0.013194024562835693, ...  \n",
       "861  [-0.035361308604478836, -0.001887816353701055,...  \n",
       "862  [-0.006428151857107878, 0.016945311799645424, ...  \n",
       "\n",
       "[863 rows x 7 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Turn the list of page records into a table with one row per page\n",
    "df = pd.DataFrame(data=all_page_data)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2da78c3a-7943-4497-849d-26df0887a156",
   "metadata": {},
   "source": [
    "We then perform dimensionality reduction on the embedding vectors and add two new columns to the dataset: UMAP0 and UMAP1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "617c5038-6f7e-4615-9008-6de4c328bd87",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\rober\\miniforge3\\envs\\devbio-napari-env\\Lib\\site-packages\\umap\\umap_.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.\n",
      "  warn(f\"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.\")\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "      <th>url</th>\n",
       "      <th>page_index</th>\n",
       "      <th>text</th>\n",
       "      <th>png_filename</th>\n",
       "      <th>txt_filename</th>\n",
       "      <th>embedding_vector</th>\n",
       "      <th>UMAP0</th>\n",
       "      <th>UMAP1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>0</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_0.png</td>\n",
       "      <td>12623730_14_Summary_0.txt</td>\n",
       "      <td>[-0.01753188483417034, 0.009571048431098461, 0...</td>\n",
       "      <td>2.785299</td>\n",
       "      <td>5.125338</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>1</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_1.png</td>\n",
       "      <td>12623730_14_Summary_1.txt</td>\n",
       "      <td>[0.001144174369983375, 0.008919398300349712, -...</td>\n",
       "      <td>1.759109</td>\n",
       "      <td>5.196022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>2</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_2.png</td>\n",
       "      <td>12623730_14_Summary_2.txt</td>\n",
       "      <td>[0.01131830457597971, 0.033359214663505554, 0....</td>\n",
       "      <td>1.605859</td>\n",
       "      <td>6.084491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>3</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_3.png</td>\n",
       "      <td>12623730_14_Summary_3.txt</td>\n",
       "      <td>[0.018105685710906982, 0.026488685980439186, 0...</td>\n",
       "      <td>1.581907</td>\n",
       "      <td>6.084695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12623730_14_Summary.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>4</td>\n",
       "      <td>Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...</td>\n",
       "      <td>12623730_14_Summary_4.png</td>\n",
       "      <td>12623730_14_Summary_4.txt</td>\n",
       "      <td>[-0.027609605342149734, 0.0015738429501652718,...</td>\n",
       "      <td>2.163119</td>\n",
       "      <td>7.161102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>858</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>29</td>\n",
       "      <td>Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_29.png</td>\n",
       "      <td>12623730_06_Chatbots_29.txt</td>\n",
       "      <td>[-0.011728818528354168, -0.0007099526119418442...</td>\n",
       "      <td>3.970344</td>\n",
       "      <td>5.525424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>859</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>30</td>\n",
       "      <td>Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_30.png</td>\n",
       "      <td>12623730_06_Chatbots_30.txt</td>\n",
       "      <td>[-0.007209372241050005, 0.004134070128202438, ...</td>\n",
       "      <td>5.693106</td>\n",
       "      <td>7.587674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>31</td>\n",
       "      <td>Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_31.png</td>\n",
       "      <td>12623730_06_Chatbots_31.txt</td>\n",
       "      <td>[-0.014446760527789593, 0.013194024562835693, ...</td>\n",
       "      <td>4.285961</td>\n",
       "      <td>5.682843</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>861</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>32</td>\n",
       "      <td>Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_32.png</td>\n",
       "      <td>12623730_06_Chatbots_32.txt</td>\n",
       "      <td>[-0.035361308604478836, -0.001887816353701055,...</td>\n",
       "      <td>6.068371</td>\n",
       "      <td>4.506485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>862</th>\n",
       "      <td>12623730_06_Chatbots.pdf</td>\n",
       "      <td>https://zenodo.org/api/records/12623730/files/...</td>\n",
       "      <td>33</td>\n",
       "      <td>Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...</td>\n",
       "      <td>12623730_06_Chatbots_33.png</td>\n",
       "      <td>12623730_06_Chatbots_33.txt</td>\n",
       "      <td>[-0.006428151857107878, 0.016945311799645424, ...</td>\n",
       "      <td>4.330415</td>\n",
       "      <td>5.363225</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>863 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     filename  \\\n",
       "0     12623730_14_Summary.pdf   \n",
       "1     12623730_14_Summary.pdf   \n",
       "2     12623730_14_Summary.pdf   \n",
       "3     12623730_14_Summary.pdf   \n",
       "4     12623730_14_Summary.pdf   \n",
       "..                        ...   \n",
       "858  12623730_06_Chatbots.pdf   \n",
       "859  12623730_06_Chatbots.pdf   \n",
       "860  12623730_06_Chatbots.pdf   \n",
       "861  12623730_06_Chatbots.pdf   \n",
       "862  12623730_06_Chatbots.pdf   \n",
       "\n",
       "                                                   url  page_index  \\\n",
       "0    https://zenodo.org/api/records/12623730/files/...           0   \n",
       "1    https://zenodo.org/api/records/12623730/files/...           1   \n",
       "2    https://zenodo.org/api/records/12623730/files/...           2   \n",
       "3    https://zenodo.org/api/records/12623730/files/...           3   \n",
       "4    https://zenodo.org/api/records/12623730/files/...           4   \n",
       "..                                                 ...         ...   \n",
       "858  https://zenodo.org/api/records/12623730/files/...          29   \n",
       "859  https://zenodo.org/api/records/12623730/files/...          30   \n",
       "860  https://zenodo.org/api/records/12623730/files/...          31   \n",
       "861  https://zenodo.org/api/records/12623730/files/...          32   \n",
       "862  https://zenodo.org/api/records/12623730/files/...          33   \n",
       "\n",
       "                                                  text  \\\n",
       "0    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "1    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "2    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "3    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "4    Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/...   \n",
       "..                                                 ...   \n",
       "858  Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "859  Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "860  Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "861  Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "862  Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L...   \n",
       "\n",
       "                    png_filename                 txt_filename  \\\n",
       "0      12623730_14_Summary_0.png    12623730_14_Summary_0.txt   \n",
       "1      12623730_14_Summary_1.png    12623730_14_Summary_1.txt   \n",
       "2      12623730_14_Summary_2.png    12623730_14_Summary_2.txt   \n",
       "3      12623730_14_Summary_3.png    12623730_14_Summary_3.txt   \n",
       "4      12623730_14_Summary_4.png    12623730_14_Summary_4.txt   \n",
       "..                           ...                          ...   \n",
       "858  12623730_06_Chatbots_29.png  12623730_06_Chatbots_29.txt   \n",
       "859  12623730_06_Chatbots_30.png  12623730_06_Chatbots_30.txt   \n",
       "860  12623730_06_Chatbots_31.png  12623730_06_Chatbots_31.txt   \n",
       "861  12623730_06_Chatbots_32.png  12623730_06_Chatbots_32.txt   \n",
       "862  12623730_06_Chatbots_33.png  12623730_06_Chatbots_33.txt   \n",
       "\n",
       "                                      embedding_vector     UMAP0     UMAP1  \n",
       "0    [-0.01753188483417034, 0.009571048431098461, 0...  2.785299  5.125338  \n",
       "1    [0.001144174369983375, 0.008919398300349712, -...  1.759109  5.196022  \n",
       "2    [0.01131830457597971, 0.033359214663505554, 0....  1.605859  6.084491  \n",
       "3    [0.018105685710906982, 0.026488685980439186, 0...  1.581907  6.084695  \n",
       "4    [-0.027609605342149734, 0.0015738429501652718,...  2.163119  7.161102  \n",
       "..                                                 ...       ...       ...  \n",
       "858  [-0.011728818528354168, -0.0007099526119418442...  3.970344  5.525424  \n",
       "859  [-0.007209372241050005, 0.004134070128202438, ...  5.693106  7.587674  \n",
       "860  [-0.014446760527789593, 0.013194024562835693, ...  4.285961  5.682843  \n",
       "861  [-0.035361308604478836, -0.001887816353701055,...  6.068371  4.506485  \n",
       "862  [-0.006428151857107878, 0.016945311799645424, ...  4.330415  5.363225  \n",
       "\n",
       "[863 rows x 9 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the embedding vectors stored in df into a single 2D array (one row per entry)\n",
    "embedding_matrix = np.array(df['embedding_vector'].tolist())\n",
    "\n",
    "# Reduce to two dimensions with UMAP, using a fixed random_state\n",
    "umap_model = umap.UMAP(n_components=2, random_state=42)\n",
    "coordinates = umap_model.fit_transform(embedding_matrix)\n",
    "\n",
    "# Store both coordinates as columns next to the other columns of df\n",
    "df['UMAP0'], df['UMAP1'] = coordinates[:, 0], coordinates[:, 1]\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f12684d8-4580-4235-b07a-acb6c8718d3e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eda68fcaa68842d8b19de4245a1c8a38",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(VBox(children=(VBox(children=(HBox(children=(Label(value='Axes '), Dropdown(index=7, layout=Lay…"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Interactive scatter plot of the two UMAP coordinates.\n",
    "# Uses the `sv` alias from the import cell at the top instead of re-importing stackview here.\n",
    "sv.scatterplot(df, column_x=\"UMAP0\", column_y=\"UMAP1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "dcc8cef5-6524-4074-af07-fddb9033fbc8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      False\n",
       "1      False\n",
       "2      False\n",
       "3      False\n",
       "4      False\n",
       "       ...  \n",
       "858    False\n",
       "859    False\n",
       "860    False\n",
       "861    False\n",
       "862    False\n",
       "Name: selection, Length: 863, dtype: bool"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): \"selection\" is not assigned explicitly in the code shown here; presumably the\n",
    "# scatterplot widget above adds it to df - confirm before relying on it.\n",
    "df.loc[:, \"selection\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b678815a-f383-423e-8f4b-0f86f8992bb1",
   "metadata": {},
   "source": [
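    "Finally, we save the data in two YAML files: `data.yml` contains the complete table including the embedding vectors, and `data_png_umap.yml` contains only the page metadata (filename, PNG filename, text, URL, page index) and the two UMAP coordinates. Afterwards, we read `data.yml` back to check that it can be loaded."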
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7b395f6a-514b-4756-825a-06bdf5d58937",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrame saved as data.yml\n"
     ]
    }
   ],
   "source": [
    "import yaml\n",
    "\n",
    "# Write the complete table (all columns of df, including the embedding vectors) to YAML\n",
    "with open('data.yml', 'w') as yaml_file:\n",
    "    yaml.dump(df.to_dict(), yaml_file)\n",
    "\n",
    "print(\"DataFrame saved as data.yml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6653ecec-524b-4a94-8f92-3126a5e91e9e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrame saved as data.yml\n"
     ]
    }
   ],
   "source": [
    "# Keep only the page metadata and the two UMAP coordinates (no embedding vectors).\n",
    "# `yaml` is already imported by the previous cell.\n",
    "umap_columns = [\"filename\", \"png_filename\", \"text\", \"url\", \"page_index\", \"UMAP0\", \"UMAP1\"]\n",
    "data_dict = df[umap_columns].to_dict()\n",
    "\n",
    "# Save as YAML file; the path is kept in one variable so the message below cannot drift from it\n",
    "umap_yaml_path = 'data_png_umap.yml'\n",
    "with open(umap_yaml_path, 'w') as file:\n",
    "    yaml.dump(data_dict, file)\n",
    "\n",
    "print(f\"DataFrame saved as {umap_yaml_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe545005-5a2a-47fd-b021-79c2ae6fcedd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the complete YAML file back and rebuild a DataFrame from the parsed dictionary\n",
    "with open('data.yml', 'r') as yaml_file:\n",
    "    loaded_df = pd.DataFrame(yaml.safe_load(yaml_file))\n",
    "\n",
    "# Show the first few rows of the reloaded table\n",
    "loaded_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "699e33da-cc31-441d-887d-7e7fd0c02db5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}