{ "cells": [ { "cell_type": "markdown", "id": "0f3ca52d-08c9-41b7-9439-0ef80dfa2b04", "metadata": {}, "source": [ "# Generating data for embedding training materials\n", "\n", "In this notebook we will download training materials in PDF format, extract the text of every page and save the pages as PNG files. We will then use OpenAI's text embeddings and UMAP dimensionality reduction to get a simple embedding of training materials contents.\n", "\n", "```\n", "pip install PyPDF2 pdf2image\n", "conda install umap-learn openai\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "id": "d285437d-c360-4971-8367-d8dfa4626974", "metadata": {}, "outputs": [], "source": [ "import os\n", "import requests\n", "import json\n", "from pdf2image import convert_from_path\n", "import PyPDF2\n", "import numpy as np\n", "import pandas as pd\n", "from openai import OpenAI\n", "import umap\n", "import stackview as sv\n", "from PIL import Image\n", "import tempfile" ] }, { "cell_type": "markdown", "id": "0376e099-4f5f-4a86-bfb3-587e14b1d62d", "metadata": {}, "source": [ "We will use training materials about [Bio-image Data Science](https://zenodo.org/records/14030307), which are licensed [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) by Robert Haase." 
] }, { "cell_type": "code", "execution_count": 2, "id": "3b7287dc-7f7d-42d3-b76d-903b3572ceea", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'filename': '12623730_14_Summary.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/14_Summary.pdf/content'},\n", " {'filename': '12623730_10_function_calling.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/10_function_calling.pdf/content'},\n", " {'filename': '12623730_11_prompteng_rag_finetuning.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/11_prompteng_rag_finetuning.pdf/content'},\n", " {'filename': '12623730_12_Vision_models.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/12_Vision_models.pdf/content'},\n", " {'filename': '12623730_09_Deep_Learning.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/09_Deep_Learning.pdf/content'},\n", " {'filename': '12623730_08_Sup_Unsup_Machine_Learning.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/08_Sup_Unsup_Machine_Learning.pdf/content'},\n", " {'filename': '12623730_03_RSM_Image_Processing.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/03_RSM_Image_Processing.pdf/content'},\n", " {'filename': '12623730_01_Introduction_BIDS_2024.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/01_Introduction_BIDS_2024.pdf/content'},\n", " {'filename': '12623730_13_quality_assurance.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/13_quality_assurance.pdf/content'},\n", " {'filename': '12623730_02_Introduction_RDM_2024.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/02_Introduction_RDM_2024.pdf/content'},\n", " {'filename': '12623730_04_Image_segmentation.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/04_Image_segmentation.pdf/content'},\n", " {'filename': '12623730_05_Surface_Recon_QA.pdf',\n", " 'url': 'https://zenodo.org/api/records/12623730/files/05_Surface_Recon_QA.pdf/content'},\n", " 
def download_pdfs_from_zenodo(record_id, download_dir='downloads', timeout=60):
    """
    Download all PDF files attached to a Zenodo record.

    Files that already exist in `download_dir` are not downloaded again.

    Parameters
    ----------
    record_id : str or int
        Identifier of the Zenodo record, e.g. '12623730'.
    download_dir : str, optional
        Folder the PDFs are written to; it is created if missing.
    timeout : float, optional
        Seconds to wait for the server on each request before giving up.

    Returns
    -------
    list of dict
        One entry per PDF with the keys 'filename' (local file name,
        prefixed with the record id) and 'url' (the download link).

    Raises
    ------
    requests.HTTPError
        If the record metadata or one of the files cannot be retrieved.
    """
    base_url = f"https://zenodo.org/api/records/{record_id}"
    record_response = requests.get(base_url, timeout=timeout)
    # Fail loudly instead of trying to parse an error page as record metadata
    record_response.raise_for_status()
    data = record_response.json()

    os.makedirs(download_dir, exist_ok=True)

    files_info = []
    for file in data['files']:
        if not file['key'].lower().endswith('.pdf'):
            continue

        download_url = file['links']['self']
        # f-string so that integer record ids work as well
        filename = f"{record_id}_{file['key']}"
        filepath = os.path.join(download_dir, filename)

        if not os.path.exists(filepath):
            file_response = requests.get(download_url, timeout=timeout)
            # Without this check an HTTP error body would be stored as a
            # ".pdf" and be mistaken for a cached download on the next run
            file_response.raise_for_status()

            # Write to a temporary name first: an interrupted download must
            # not leave a truncated file that passes the exists() check above
            partial_path = filepath + '.part'
            with open(partial_path, 'wb') as f:
                f.write(file_response.content)
            os.replace(partial_path, filepath)

        files_info.append({'filename': filename, 'url': download_url})

    return files_info


# Download PDFs
files_info = download_pdfs_from_zenodo('12623730')
files_info
def resize_image(image, height):
    """
    Resize the image to the specified height while maintaining aspect ratio.

    Parameters
    ----------
    image : PIL.Image.Image
        The image to resize.
    height : int
        The desired height in pixels.

    Returns
    -------
    PIL.Image.Image
        The resized image.
    """
    aspect_ratio = image.width / image.height
    # Never request a zero-pixel width for extremely narrow images
    new_width = max(1, int(aspect_ratio * height))
    # Newer Pillow versions expose the filters as Image.Resampling.LANCZOS and
    # warn about Image.LANCZOS; older versions only have the module constant.
    resampling = getattr(Image, 'Resampling', Image)
    return image.resize((new_width, height), resampling.LANCZOS)


def process_pdf(pdf_info, client=None, image_height=300, model="text-embedding-ada-002"):
    """
    Render every page of a downloaded PDF as PNG, extract its text and embed it.

    For page `i` of `<name>.pdf` the files `downloads/<name>_<i>.png` and
    `downloads/<name>_<i>.txt` are written. An existing text file is kept.

    Parameters
    ----------
    pdf_info : dict
        Entry with the keys 'filename' (file inside the 'downloads' folder)
        and 'url' (where the PDF came from).
    client : OpenAI, optional
        Client used to request embeddings. If None, a new one is created.
    image_height : int, optional
        Height in pixels of the saved page images.
    model : str, optional
        Name of the OpenAI embedding model.

    Returns
    -------
    list of dict
        One record per page with the keys 'filename', 'url', 'page_index',
        'text', 'png_filename', 'txt_filename' and 'embedding_vector'.
    """
    filename = pdf_info['filename']
    filepath = os.path.join('downloads', filename)
    base_name = os.path.splitext(filename)[0]

    os.makedirs('downloads', exist_ok=True)

    if client is None:
        # NOTE(review): OpenAI() is expected to read the key from the
        # OPENAI_API_KEY environment variable - confirm for your setup
        client = OpenAI()

    # Convert PDF pages to images
    images = [resize_image(page_image, height=image_height)
              for page_image in convert_from_path(filepath)]

    # Extract text from PDF
    pdf_reader = PyPDF2.PdfReader(filepath)

    page_data = []

    for i, image in enumerate(images):
        # Save image
        png_filename = f"{base_name}_{i}.png"
        png_path = os.path.join('downloads', png_filename)
        image.save(png_path)

        # Save text
        txt_filename = f"{base_name}_{i}.txt"
        txt_path = os.path.join('downloads', txt_filename)
        text = pdf_reader.pages[i].extract_text()
        # Check the path inside 'downloads', not the bare file name, which
        # would be looked up in the current working directory
        if not os.path.exists(txt_path):
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(text)

        # Get embedding
        # NOTE(review): pages without extractable text yield an empty string,
        # which the embeddings endpoint may reject - confirm and handle if needed
        response = client.embeddings.create(
            input=text,
            model=model
        )
        embedding_vector = response.data[0].embedding

        page_data.append({
            'filename': filename,
            'url': pdf_info['url'],
            'page_index': i,
            'text': text,
            'png_filename': png_filename,
            'txt_filename': txt_filename,
            'embedding_vector': embedding_vector
        })

    return page_data


# Process all PDFs, sharing one client across all files
openai_client = OpenAI()
all_page_data = []
for pdf_info in files_info:
    all_page_data.extend(process_pdf(pdf_info, client=openai_client))
| \n", " | filename | \n", "url | \n", "page_index | \n", "text | \n", "png_filename | \n", "txt_filename | \n", "embedding_vector | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "0 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_0.png | \n", "12623730_14_Summary_0.txt | \n", "[-0.01753188483417034, 0.009571048431098461, 0... | \n", "
| 1 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "1 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_1.png | \n", "12623730_14_Summary_1.txt | \n", "[0.001144174369983375, 0.008919398300349712, -... | \n", "
| 2 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "2 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_2.png | \n", "12623730_14_Summary_2.txt | \n", "[0.01131830457597971, 0.033359214663505554, 0.... | \n", "
| 3 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "3 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_3.png | \n", "12623730_14_Summary_3.txt | \n", "[0.018105685710906982, 0.026488685980439186, 0... | \n", "
| 4 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "4 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_4.png | \n", "12623730_14_Summary_4.txt | \n", "[-0.027609605342149734, 0.0015738429501652718,... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 858 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "29 | \n", "Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_29.png | \n", "12623730_06_Chatbots_29.txt | \n", "[-0.011728818528354168, -0.0007099526119418442... | \n", "
| 859 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "30 | \n", "Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_30.png | \n", "12623730_06_Chatbots_30.txt | \n", "[-0.007209372241050005, 0.004134070128202438, ... | \n", "
| 860 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "31 | \n", "Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_31.png | \n", "12623730_06_Chatbots_31.txt | \n", "[-0.014446760527789593, 0.013194024562835693, ... | \n", "
| 861 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "32 | \n", "Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_32.png | \n", "12623730_06_Chatbots_32.txt | \n", "[-0.035361308604478836, -0.001887816353701055,... | \n", "
| 862 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "33 | \n", "Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_33.png | \n", "12623730_06_Chatbots_33.txt | \n", "[-0.006428151857107878, 0.016945311799645424, ... | \n", "
863 rows × 7 columns
\n", "| \n", " | filename | \n", "url | \n", "page_index | \n", "text | \n", "png_filename | \n", "txt_filename | \n", "embedding_vector | \n", "UMAP0 | \n", "UMAP1 | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "0 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_0.png | \n", "12623730_14_Summary_0.txt | \n", "[-0.01753188483417034, 0.009571048431098461, 0... | \n", "2.785299 | \n", "5.125338 | \n", "
| 1 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "1 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_1.png | \n", "12623730_14_Summary_1.txt | \n", "[0.001144174369983375, 0.008919398300349712, -... | \n", "1.759109 | \n", "5.196022 | \n", "
| 2 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "2 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_2.png | \n", "12623730_14_Summary_2.txt | \n", "[0.01131830457597971, 0.033359214663505554, 0.... | \n", "1.605859 | \n", "6.084491 | \n", "
| 3 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "3 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_3.png | \n", "12623730_14_Summary_3.txt | \n", "[0.018105685710906982, 0.026488685980439186, 0... | \n", "1.581907 | \n", "6.084695 | \n", "
| 4 | \n", "12623730_14_Summary.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "4 | \n", "Robert Haase\\n@haesleinhuepf\\nBIDS Lecture 14/... | \n", "12623730_14_Summary_4.png | \n", "12623730_14_Summary_4.txt | \n", "[-0.027609605342149734, 0.0015738429501652718,... | \n", "2.163119 | \n", "7.161102 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 858 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "29 | \n", "Slide 30\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_29.png | \n", "12623730_06_Chatbots_29.txt | \n", "[-0.011728818528354168, -0.0007099526119418442... | \n", "3.970344 | \n", "5.525424 | \n", "
| 859 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "30 | \n", "Slide 31\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_30.png | \n", "12623730_06_Chatbots_30.txt | \n", "[-0.007209372241050005, 0.004134070128202438, ... | \n", "5.693106 | \n", "7.587674 | \n", "
| 860 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "31 | \n", "Slide 32\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_31.png | \n", "12623730_06_Chatbots_31.txt | \n", "[-0.014446760527789593, 0.013194024562835693, ... | \n", "4.285961 | \n", "5.682843 | \n", "
| 861 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "32 | \n", "Slide 33\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_32.png | \n", "12623730_06_Chatbots_32.txt | \n", "[-0.035361308604478836, -0.001887816353701055,... | \n", "6.068371 | \n", "4.506485 | \n", "
| 862 | \n", "12623730_06_Chatbots.pdf | \n", "https://zenodo.org/api/records/12623730/files/... | \n", "33 | \n", "Slide 34\\nRobert Haase\\n@haesleinhuepf\\nBIDS L... | \n", "12623730_06_Chatbots_33.png | \n", "12623730_06_Chatbots_33.txt | \n", "[-0.006428151857107878, 0.016945311799645424, ... | \n", "4.330415 | \n", "5.363225 | \n", "
863 rows × 9 columns
\n", "