{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b3ffd4e-1f4b-41a0-bb57-c1f95fb9eb64",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import requests\n",
    "import docx\n",
    "from typing import List, Dict\n",
    "\n",
    "if 'data_loader' not in globals():\n",
    "    from mage_ai.data_preparation.decorators import data_loader\n",
    "if 'test' not in globals():\n",
    "    from mage_ai.data_preparation.decorators import test\n",
    "\n",
    "def clean_line(line: str) -> str:\n",
    "    line = line.strip()\n",
    "    line = line.strip('\\uFEFF')\n",
    "    return line\n",
    "\n",
    "def read_faq(file_id: str) -> List[Dict]:\n",
    "    \"\"\"\n",
    "    Fetch and parse the FAQ document from Google Docs.\n",
    "    \n",
    "    Args:\n",
    "        file_id (str): Google Docs file ID to be downloaded and processed.\n",
    "\n",
    "    Returns:\n",
    "        List[Dict]: A list of dictionaries containing the parsed questions and answers.\n",
    "    \"\"\"\n",
    "    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'\n",
    "    \n",
    "    response = requests.get(url)\n",
    "    response.raise_for_status()\n",
    "    \n",
    "    with io.BytesIO(response.content) as f_in:\n",
    "        doc = docx.Document(f_in)\n",
    "\n",
    "    questions = []\n",
    "\n",
    "    question_heading_style = 'heading 2'\n",
    "    section_heading_style = 'heading 1'\n",
    "    \n",
    "    section_title = ''\n",
    "    question_title = ''\n",
    "    answer_text_so_far = ''\n",
    "     \n",
    "    for p in doc.paragraphs:\n",
    "        style = p.style.name.lower()\n",
    "        p_text = clean_line(p.text)\n",
    "    \n",
    "        if len(p_text) == 0:\n",
    "            continue\n",
    "    \n",
    "        if style == section_heading_style:\n",
    "            section_title = p_text\n",
    "            continue\n",
    "    \n",
    "        if style == question_heading_style:\n",
    "            answer_text_so_far = answer_text_so_far.strip()\n",
    "            if answer_text_so_far and section_title and question_title:\n",
    "                questions.append({\n",
    "                    'text': answer_text_so_far,\n",
    "                    'section': section_title,\n",
    "                    'question': question_title,\n",
    "                })\n",
    "                answer_text_so_far = ''\n",
    "    \n",
    "            question_title = p_text\n",
    "            continue\n",
    "        \n",
    "        answer_text_so_far += '\\n' + p_text\n",
    "    \n",
    "    answer_text_so_far = answer_text_so_far.strip()\n",
    "    if answer_text_so_far and section_title and question_title:\n",
    "        questions.append({\n",
    "            'text': answer_text_so_far,\n",
    "            'section': section_title,\n",
    "            'question': question_title,\n",
    "        })\n",
    "\n",
    "    return questions\n",
    "\n",
    "@data_loader\n",
    "def process_faq_documents(*args, **kwargs) -> List[Dict]:\n",
    "    \"\"\"\n",
    "    Fetch and process FAQ documents from Google Docs.\n",
    "\n",
    "    Args:\n",
    "        *args: Variable length argument list.\n",
    "        **kwargs: Arbitrary keyword arguments.\n",
    "\n",
    "    Keyword Args:\n",
    "        faq_documents (Dict[str, str]): A dictionary where keys are course names and values are Google Docs file IDs.\n",
    "\n",
    "    Returns:\n",
    "        List[Dict]: A list of dictionaries containing course names and their associated documents.\n",
    "    \"\"\"\n",
    "    faq_documents = kwargs.get('faq_documents', {\n",
    "        'another-course': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E',  # Example document\n",
    "    })\n",
    "    \n",
    "    documents = []\n",
    "    \n",
    "    for course, file_id in faq_documents.items():\n",
    "        course_documents = read_faq(file_id)\n",
    "        documents.append({'course': course, 'documents': course_documents})\n",
    "    \n",
    "    # Log the number of documents processed\n",
    "    print(f\"Processed {len(documents)} documents\")\n",
    "    \n",
    "    return documents\n",
    "\n",
    "@test\n",
    "def test_process_faq_documents(output, *args) -> None:\n",
    "    \"\"\"\n",
    "    Test function for the process_faq_documents block.\n",
    "    \"\"\"\n",
    "    assert len(output) > 0, 'No documents were processed'\n",
    "    for doc in output:\n",
    "        assert 'course' in doc and 'documents' in doc, 'Processed document missing required keys'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b76bd8ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "import hashlib\n",
    "from typing import Any, Dict, List\n",
    "\n",
    "def generate_document_id(doc: Dict[str, Any]) -> str:\n",
    "    \"\"\"\n",
    "    Generate a unique document ID based on course, question, and a portion of the text.\n",
    "    \"\"\"\n",
    "    combined = f\"{doc['course']}-{doc['question']}-{doc['text'][:10]}\"\n",
    "    hash_object = hashlib.md5(combined.encode())\n",
    "    hash_hex = hash_object.hexdigest()\n",
    "    document_id = hash_hex[:8]\n",
    "    return document_id\n",
    "\n",
    "@transformer\n",
    "def chunk_documents(data: Any, *args, **kwargs) -> List[Dict[str, Any]]:\n",
    "    \"\"\"\n",
    "    Transform the documents by adding a course name and generating a unique document ID.\n",
    "    This assumes that the documents already have well-defined boundaries and no chunking is needed.\n",
    "    \"\"\"\n",
    "    documents = []\n",
    "\n",
    "    for doc in data['documents']:\n",
    "        doc['course'] = data['course']\n",
    "        # previously we used just \"id\"  for document ID\n",
    "        doc['document_id'] = generate_document_id(doc)\n",
    "        documents.append(doc)\n",
    "\n",
    "    print(len(documents))\n",
    "\n",
    "    print(documents)\n",
    "\n",
    "    return documents\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}