---
name: knowledge-ingestion-patterns
description: Patterns for ingesting knowledge into vector databases and RAG systems
---

# Knowledge Ingestion Patterns Skill

> Systematic approaches for ingesting different content types into RAG with optimal chunking, metadata, and retrieval quality.

## Overview

Different content types require different ingestion strategies. This skill documents best practices for:

- Websites and web content
- PDF documents
- Code repositories
- Conversation exports
- Research notes
- API documentation

## Core Principles

1. **Chunk for retrieval** - Optimize chunk size for the questions you'll ask
2. **Metadata matters** - Rich metadata enables filtered search
3. **Preserve context** - Don't lose meaning when splitting
4. **Deduplicate** - Avoid ingesting the same content twice

## Content Type Patterns

### Pattern 2: PDF Documents

**When to use**: Research papers, reports, ebooks, scanned documents

**Chunking Strategy**: Page-aware paragraph chunking; extract tables as separate chunks

```python
import fitz  # PyMuPDF
from typing import List, Dict


def chunk_pdf(pdf_path: str, chunk_size: int = 500) -> List[Dict]:
    """Extract and chunk PDF content with page awareness."""
    doc = fitz.open(pdf_path)
    chunks = []

    for page_num, page in enumerate(doc, 1):
        text = page.get_text()

        # Skip empty pages
        if not text.strip():
            continue

        # Split into paragraphs
        paragraphs = text.split('\n\n')
        current_chunk = ""

        for para in paragraphs:
            para = para.strip()
            if not para:
                continue

            if len(current_chunk) + len(para) < chunk_size:
                current_chunk += " " + para
            else:
                if current_chunk:
                    chunks.append({
                        "content": current_chunk.strip(),
                        "metadata": {
                            "type": "pdf",
                            "source": pdf_path,
                            "page": page_num,
                            "total_pages": len(doc)
                        }
                    })
                current_chunk = para

        # Don't forget the last chunk of the page
        if current_chunk:
            chunks.append({
                "content": current_chunk.strip(),
                "metadata": {
                    "type": "pdf",
                    "source": pdf_path,
                    "page": page_num,
                    "total_pages": len(doc)
                }
            })

    return chunks


def extract_pdf_tables(pdf_path: str) -> List[Dict]:
    """Extract tables from PDF as separate chunks."""
    import pdfplumber

    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            for table_num, table in enumerate(page.extract_tables(), 1):
                # Convert table to markdown format
                if table:
                    headers = table[0]
                    rows = table[1:]

                    md_table = "| " + " | ".join(str(h) for h in headers) + " |\n"
                    md_table += "| " + " | ".join("---" for _ in headers) + " |\n"
                    for row in rows:
                        md_table += "| " + " | ".join(str(c) for c in row) + " |\n"

                    tables.append({
                        "content": md_table,
                        "metadata": {
                            "type": "pdf_table",
                            "source": pdf_path,
                            "page": page_num,
                            "table_number": table_num
                        }
                    })

    return tables
```

**Metadata Schema**:

```yaml
type: pdf | pdf_table
source: file path
page: page number
total_pages: document length
table_number: (for tables) which table on the page
```

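The two extractors above produce separate chunk lists, so running both on the same file can yield overlapping content (a table's text may also appear inside a page chunk). Below is a minimal sketch of combining them with hash-based deduplication, in line with the "Deduplicate" principle; the `ingest_pdf` helper and the `content_hash` metadata field are illustrative assumptions, not part of any particular library.

```python
import hashlib
from typing import List, Dict


def ingest_pdf(pdf_path: str) -> List[Dict]:
    """Combine page and table chunks for one PDF, dropping exact duplicates.

    Hypothetical helper built on chunk_pdf / extract_pdf_tables defined above.
    """
    seen = set()
    deduped = []
    for chunk in chunk_pdf(pdf_path) + extract_pdf_tables(pdf_path):
        # Hash normalized content so re-ingesting the same file adds nothing new
        key = hashlib.sha256(
            chunk["content"].strip().lower().encode("utf-8")
        ).hexdigest()
        if key in seen:
            continue
        seen.add(key)
        chunk["metadata"]["content_hash"] = key
        deduped.append(chunk)
    return deduped
```

Many vector stores accept caller-supplied IDs, so the content hash can also serve as the chunk ID, turning repeated ingestion of the same file into an upsert rather than a duplicate insert.
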
### Pattern 4: Websites / Web Content

**When to use**: Documentation sites, articles, blog posts

**Chunking Strategy**: Strip navigation and boilerplate, then chunk by content sections

```python
import httpx
from bs4 import BeautifulSoup
from typing import List, Dict
from urllib.parse import urljoin, urlparse


def chunk_webpage(url: str) -> List[Dict]:
    """Fetch and chunk a webpage."""
    response = httpx.get(url, follow_redirects=True)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove noise
    for tag in soup.find_all(['nav', 'footer', 'aside', 'script', 'style']):
        tag.decompose()

    chunks = []

    # Find main content
    main = soup.find('main') or soup.find('article') or soup.find('body')

    # Chunk by sections
    for section in main.find_all(['section', 'div'], class_=lambda x: x and 'content' in str(x).lower()):
        text = section.get_text(separator=' ', strip=True)
        if len(text) > 100:  # Skip tiny sections
            chunks.append({
                "content": text,
                "metadata": {
                    "type": "webpage",
                    "source": url,
                    "domain": urlparse(url).netloc,
                    "title": soup.title.string if soup.title else ""
                }
            })

    # If no sections found, chunk the whole page
    if not chunks:
        text = main.get_text(separator=' ', strip=True)
        # Split into ~500-word chunks with a 50-word overlap
        words = text.split()
        for i in range(0, len(words), 450):
            chunk_text = ' '.join(words[i:i + 500])
            chunks.append({
                "content": chunk_text,
                "metadata": {
                    "type": "webpage",
                    "source": url,
                    "domain": urlparse(url).netloc,
                    "title": soup.title.string if soup.title else ""
                }
            })

    return chunks


async def crawl_site(start_url: str, max_pages: int = 50) -> List[Dict]:
    """Crawl a site and chunk all pages."""
    base_domain = urlparse(start_url).netloc
    visited = set()
    to_visit = [start_url]
    all_chunks = []

    async with httpx.AsyncClient() as client:
        while to_visit and len(visited) < max_pages:
            url = to_visit.pop(0)
            if url in visited:
                continue

            try:
                response = await client.get(url, follow_redirects=True)
                visited.add(url)

                # Chunk this page (chunk_webpage re-fetches it synchronously;
                # acceptable for small crawls)
                all_chunks.extend(chunk_webpage(url))

                # Find same-domain links to follow
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = urljoin(url, link['href'])
                    if urlparse(href).netloc == base_domain and href not in visited:
                        to_visit.append(href)
            except Exception as e:
                print(f"Failed to fetch {url}: {e}")

    return all_chunks
```

**Metadata Schema**:

```yaml
type: webpage
source: full URL
domain: domain name
title: page title
crawl_depth: (for crawls) how many links from start
```

### Pattern 6: Research Notes

**When to use**: Personal notes, research findings, learnings

**Chunking Strategy**: By paragraph with topic extraction

```python
from typing import List, Dict
from datetime import datetime


def chunk_research_notes(content: str, topic: str = None) -> List[Dict]:
    """Chunk research notes with topic awareness."""
    # Split by double newlines (paragraphs)
    paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

    chunks = []
    current_topic = topic or "general"

    for para in paragraphs:
        # Check if this is a topic header
        if para.startswith('#') or (len(para) < 50 and para.endswith(':')):
            current_topic = para.strip('#: ')
            continue

        chunks.append({
            "content": para,
            "metadata": {
                "type": "research",
                "topic": current_topic,
                "ingested_at": datetime.now().isoformat(),
                "word_count": len(para.split())
            }
        })

    return chunks


def chunk_with_source_attribution(
    content: str,
    source_url: str = None,
    source_title: str = None,
    researcher: str = None
) -> List[Dict]:
    """Chunk research with full source attribution."""
    chunks = chunk_research_notes(content)

    for chunk in chunks:
        chunk["metadata"].update({
            "source_url": source_url,
            "source_title": source_title,
            "researcher": researcher
        })

    return chunks
```

**Metadata Schema**:

```yaml
type: research
topic: extracted or assigned topic
source_url: where the info came from
source_title: title of source
researcher: who did the research
ingested_at: timestamp
word_count: number of words in the chunk
```

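A short usage sketch for the research-note chunkers; it relies on the two functions defined above, and the note text, URL, and researcher name are made-up illustrative values.

```python
note = """# Retrieval quality

Smaller chunks improve precision on narrow questions but lose surrounding context.

Open question:

Does a 500-character chunk size hold up for scanned PDFs?"""

chunks = chunk_with_source_attribution(
    note,
    source_url="https://example.com/retrieval-notes",  # illustrative
    source_title="Retrieval quality notes",
    researcher="example-user",
)

for chunk in chunks:
    print(chunk["metadata"]["topic"], "->", chunk["content"][:60])
```

The heading sets the topic to `Retrieval quality` for the first paragraph, and the short line ending in a colon switches it to `Open question` for the paragraph that follows; both chunks carry the same source attribution fields. Note that a topic line must sit in its own paragraph (blank lines around it) for the double-newline splitter to pick it up.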