{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "5b3ffd4e-1f4b-41a0-bb57-c1f95fb9eb64", "metadata": {}, "outputs": [], "source": [ "import io\n", "import requests\n", "import docx\n", "from typing import List, Dict\n", "\n", "if 'data_loader' not in globals():\n", " from mage_ai.data_preparation.decorators import data_loader\n", "if 'test' not in globals():\n", " from mage_ai.data_preparation.decorators import test\n", "\n", "def clean_line(line: str) -> str:\n", " line = line.strip()\n", " line = line.strip('\\uFEFF')\n", " return line\n", "\n", "def read_faq(file_id: str) -> List[Dict]:\n", " \"\"\"\n", " Fetch and parse the FAQ document from Google Docs.\n", " \n", " Args:\n", " file_id (str): Google Docs file ID to be downloaded and processed.\n", "\n", " Returns:\n", " List[Dict]: A list of dictionaries containing the parsed questions and answers.\n", " \"\"\"\n", " url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'\n", " \n", " response = requests.get(url)\n", " response.raise_for_status()\n", " \n", " with io.BytesIO(response.content) as f_in:\n", " doc = docx.Document(f_in)\n", "\n", " questions = []\n", "\n", " question_heading_style = 'heading 2'\n", " section_heading_style = 'heading 1'\n", " \n", " section_title = ''\n", " question_title = ''\n", " answer_text_so_far = ''\n", " \n", " for p in doc.paragraphs:\n", " style = p.style.name.lower()\n", " p_text = clean_line(p.text)\n", " \n", " if len(p_text) == 0:\n", " continue\n", " \n", " if style == section_heading_style:\n", " section_title = p_text\n", " continue\n", " \n", " if style == question_heading_style:\n", " answer_text_so_far = answer_text_so_far.strip()\n", " if answer_text_so_far and section_title and question_title:\n", " questions.append({\n", " 'text': answer_text_so_far,\n", " 'section': section_title,\n", " 'question': question_title,\n", " })\n", " answer_text_so_far = ''\n", " \n", " question_title = p_text\n", " continue\n", " \n", " answer_text_so_far += '\\n' + p_text\n", " \n", " answer_text_so_far = answer_text_so_far.strip()\n", " if answer_text_so_far and section_title and question_title:\n", " questions.append({\n", " 'text': answer_text_so_far,\n", " 'section': section_title,\n", " 'question': question_title,\n", " })\n", "\n", " return questions\n", "\n", "@data_loader\n", "def process_faq_documents(*args, **kwargs) -> List[Dict]:\n", " \"\"\"\n", " Fetch and process FAQ documents from Google Docs.\n", "\n", " Args:\n", " *args: Variable length argument list.\n", " **kwargs: Arbitrary keyword arguments.\n", "\n", " Keyword Args:\n", " faq_documents (Dict[str, str]): A dictionary where keys are course names and values are Google Docs file IDs.\n", "\n", " Returns:\n", " List[Dict]: A list of dictionaries containing course names and their associated documents.\n", " \"\"\"\n", " faq_documents = kwargs.get('faq_documents', {\n", " 'another-course': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E', # Example document\n", " })\n", " \n", " documents = []\n", " \n", " for course, file_id in faq_documents.items():\n", " course_documents = read_faq(file_id)\n", " documents.append({'course': course, 'documents': course_documents})\n", " \n", " # Log the number of documents processed\n", " print(f\"Processed {len(documents)} documents\")\n", " \n", " return documents\n", "\n", "@test\n", "def test_process_faq_documents(output, *args) -> None:\n", " \"\"\"\n", " Test function for the process_faq_documents block.\n", " \"\"\"\n", " assert len(output) > 0, 'No documents were processed'\n", " for doc in output:\n", " assert 'course' in doc and 'documents' in doc, 'Processed document missing required keys'\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b76bd8ce", "metadata": {}, "outputs": [], "source": [ "import hashlib\n", "from typing import Any, Dict, List\n", "\n", "def generate_document_id(doc: Dict[str, Any]) -> str:\n", " \"\"\"\n", " Generate a unique document ID based on course, question, and a portion of the text.\n", " \"\"\"\n", " combined = f\"{doc['course']}-{doc['question']}-{doc['text'][:10]}\"\n", " hash_object = hashlib.md5(combined.encode())\n", " hash_hex = hash_object.hexdigest()\n", " document_id = hash_hex[:8]\n", " return document_id\n", "\n", "@transformer\n", "def chunk_documents(data: Any, *args, **kwargs) -> List[Dict[str, Any]]:\n", " \"\"\"\n", " Transform the documents by adding a course name and generating a unique document ID.\n", " This assumes that the documents already have well-defined boundaries and no chunking is needed.\n", " \"\"\"\n", " documents = []\n", "\n", " for doc in data['documents']:\n", " doc['course'] = data['course']\n", " # previously we used just \"id\" for document ID\n", " doc['document_id'] = generate_document_id(doc)\n", " documents.append(doc)\n", "\n", " print(len(documents))\n", "\n", " print(documents)\n", "\n", " return documents\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }