---
name: pdf-harvester
description: Extract text and data from PDF documents
---


# PDF Harvester Skill

> Extract and ingest PDF documents into RAG with proper text extraction, table handling, and metadata.

## Overview

PDFs are common for research papers, reports, manuals, and ebooks. This skill covers:
- Text extraction with layout preservation
- Table extraction and conversion to markdown
- Academic paper patterns (abstract, sections, citations)
- OCR for scanned documents
- Multi-page chunking strategies

## Prerequisites

```bash
# Core extraction
pip install pdfplumber pymupdf

# For OCR (scanned documents)
pip install pytesseract pdf2image
# Also need: brew install tesseract poppler (macOS)

# For academic papers
pip install arxiv  # If fetching from arXiv
```

## Extraction Methods

### Method 1: pdfplumber (Recommended)

Best for structured PDFs with tables.

```python
#!/usr/bin/env python3
"""PDF extraction using pdfplumber."""

import pdfplumber
from pathlib import Path
from typing import Dict, List, Optional
import re

def extract_pdf_text(
    pdf_path: str,
    extract_tables: bool = True
) -> Dict:
    """
    Extract text and tables from PDF.

    Args:
        pdf_path: Path to PDF file
        extract_tables: Whether to extract tables separately

    Returns:
        Dict with pages, tables, and metadata
    """
    result = {
        "pages": [],
        "tables": [],
        "metadata": {},
        "total_pages": 0
    }

    with pdfplumber.open(pdf_path) as pdf:
        result["total_pages"] = len(pdf.pages)
        result["metadata"] = pdf.metadata or {}

        for page_num, page in enumerate(pdf.pages, 1):
            # Extract text
            text = page.extract_text() or ""

            result["pages"].append({
                "page_number": page_num,
                "text": text,
                "width": page.width,
                "height": page.height
            })

            # Extract tables
            if extract_tables:
                tables = page.extract_tables()
                for table_num, table in enumerate(tables, 1):
                    if table and len(table) > 0:
                        result["tables"].append({
                            "page_number": page_num,
                            "table_number": table_num,
                            "data": table,
                            "markdown": table_to_markdown(table)
                        })

    return result


def table_to_markdown(table: List[List]) -> str:
    """Convert table data to markdown format."""
    if not table or len(table) == 0:
        return ""

    # Clean cells
    def clean_cell(cell):
        if cell is None:
            return ""
        return str(cell).replace("
", " ").strip()

    # Header row
    headers = [clean_cell(c) for c in table[0]]
    md = "| " + " | ".join(headers) + " |
"
    md += "| " + " | ".join(["---"] * len(headers)) + " |
"

    # Data rows
    for row in table[1:]:
        cells = [clean_cell(c) for c in row]
        # Pad if necessary
        while len(cells) < len(headers):
            cells.append("")
        md += "| " + " | ".join(cells[:len(headers)]) + " |
"

    return md
```

### Method 2: PyMuPDF (fitz)

Faster, better for large PDFs.

```python
#!/usr/bin/env python3
"""PDF extraction using PyMuPDF."""

import fitz  # PyMuPDF
from typing import Dict, List

def extract_with_pymupdf(pdf_path: str) -> Dict:
    """
    Extract text using PyMuPDF.

    Faster than pdfplumber, good for large documents.
    """
    doc = fitz.open(pdf_path)

    result = {
        "pages": [],
        "metadata": doc.metadata,
        "total_pages": len(doc)
    }

    for page_num, page in enumerate(doc, 1):
        # Get text with layout preservation
        text = page.get_text("text")

        # Get text blocks for better structure
        blocks = page.get_text("dict")["blocks"]

        result["pages"].append({
            "page_number": page_num,
            "text": text,
            "blocks": len(blocks)
        })

    doc.close()
    return result


def extract_with_structure(pdf_path: str) -> Dict:
    """Extract with heading detection."""
    doc = fitz.open(pdf_path)

    pages = []
    for page_num, page in enumerate(doc, 1):
        blocks = page.get_text("dict")["blocks"]

        structured_content = []
        for block in blocks:
            if block["type"] == 0:  # Text block
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span["text"].strip()
                        font_size = span["size"]
                        is_bold = "bold" in span["font"].lower()

                        # Detect headings by font size
                        if font_size > 14 or is_bold:
                            structured_content.append({
                                "type": "heading",
                                "text": text,
                                "size": font_size
                            })
                        else:
                            structured_content.append({
                                "type": "paragraph",
                                "text": text
                            })

        pages.append({
            "page_number": page_num,
            "content": structured_content
        })

    doc.close()
    return {"pages": pages, "total_pages": len(pages)}
```

### Method 3: OCR for Scanned PDFs

```python
#!/usr/bin/env python3
"""OCR extraction for scanned PDFs."""

import pytesseract
from pdf2image import convert_from_path
from typing import Dict, List

def extract_with_ocr(
    pdf_path: str,
    language: str = "eng",
    dpi: int = 300
) -> Dict:
    """
    Extract text from scanned PDF using OCR.

    Args:
        pdf_path: Path to PDF
        language: Tesseract language code
        dpi: Resolution for conversion
    """
    # Convert PDF pages to images
    images = convert_from_path(pdf_path, dpi=dpi)

    pages = []
    for page_num, image in enumerate(images, 1):
        # Run OCR
        text = pytesseract.image_to_string(image, lang=language)

        pages.append({
            "page_number": page_num,
            "text": text,
            "ocr": True
        })

    return {
        "pages": pages,
        "total_pages": len(pages),
        "ocr_used": True
    }


def is_scanned_pdf(pdf_path: str) -> bool:
    """Detect if PDF is scanned (image-based)."""
    import fitz

    doc = fitz.open(pdf_path)

    # Check first few pages
    for page in doc[:min(3, len(doc))]:
        text = page.get_text().strip()
        if len(text) > 100:  # Has extractable text
            doc.close()
            return False

    doc.close()
    return True
```

## Chunking Strategies

### Strategy 1: Page-Based

Simple chunking by page boundaries.

```python
def chunk_by_pages(
    extracted: Dict,
    pages_per_chunk: int = 1
) -> List[Dict]:
    """Chunk PDF by page boundaries."""
    chunks = []
    pages = extracted["pages"]

    for i in range(0, len(pages), pages_per_chunk):
        page_group = pages[i:i + pages_per_chunk]

        text = "

".join(p["text"] for p in page_group)

        chunks.append({
            "content": text,
            "page_start": page_group[0]["page_number"],
            "page_end": page_group[-1]["page_number"],
            "chunk_index": len(chunks)
        })

    return chunks
```

### Strategy 2: Section-Based

Chunk by document sections/headings.

```python
def chunk_by_sections(
    extracted: Dict,
    heading_patterns: List[str] = None
) -> List[Dict]:
    """Chunk PDF by section headings."""
    if heading_patterns is None:
        heading_patterns = [
            r'^#+\s',                    # Markdown headings
            r'^\d+\.\s+[A-Z]',           # Numbered sections
            r'^[A-Z][A-Z\s]+$',          # ALL CAPS headings
            r'^(Abstract|Introduction|Conclusion|References)',
        ]

    full_text = "

".join(p["text"] for p in extracted["pages"])

    # Find section boundaries
    sections = []
    current_section = {"title": "Introduction", "content": "", "start_pos": 0}

    lines = full_text.split("
")

    for line in lines:
        is_heading = any(
            re.match(pattern, line.strip())
            for pattern in heading_patterns
        )

        if is_heading and current_section["content"].strip():
            sections.append(current_section)
            current_section = {
                "title": line.strip(),
                "content": "",
                "start_pos": len(sections)
            }
        else:
            current_section["content"] += line + "
"

    # Don't forget last section
    if current_section["content"].strip():
        sections.append(current_section)

    return [
        {
            "content": s["content"].strip(),
            "section": s["title"],
            "chunk_index": i
        }
        for i, s in enumerate(sections)
    ]
```

### Strategy 3: Semantic Paragraphs

Chunk by paragraph with size limits.

```python
def chunk_by_paragraphs(
    extracted: Dict,
    max_chunk_size: int = 500,  # words
    overlap: int = 50
) -> List[Dict]:
    """Chunk by paragraphs with overlap."""
    full_text = "

".join(p["text"] for p in extracted["pages"])

    # Split into paragraphs
    paragraphs = [p.strip() for p in full_text.split("

") if p.strip()]

    chunks = []
    current_chunk = []
    current_size = 0

    for para in paragraphs:
        para_size = len(para.split())

        if current_size + para_size > max_chunk_size and current_chunk:
            # Save current chunk
            chunks.append({
                "content": "

".join(current_chunk),
                "chunk_index": len(chunks),
                "word_count": current_size
            })

            # Start new chunk with overlap
            overlap_text = current_chunk[-1] if current_chunk else ""
            current_chunk = [overlap_text] if overlap_text else []
            current_size = len(overlap_text.split()) if overlap_text else 0

        current_chunk.append(para)
        current_size += para_size

    # Last chunk
    if current_chunk:
        chunks.append({
            "content": "

".join(current_chunk),
            "chunk_index": len(chunks),
            "word_count": current_size
        })

    return chunks
```

## Academic Paper Pattern

Special handling for research papers.

```python
def extract_academic_paper(pdf_path: str) -> Dict:
    """
    Extract academic paper with structure detection.

    Identifies: title, authors, abstract, sections, references
    """
    extracted = extract_pdf_text(pdf_path)
    full_text = "
".join(p["text"] for p in extracted["pages"])

    paper = {
        "title": "",
        "authors": [],
        "abstract": "",
        "sections": [],
        "references": [],
        "tables": extracted["tables"]
    }

    # Title is usually first large text
    lines = full_text.split("
")
    for line in lines[:10]:
        if len(line) > 20 and len(line) < 200:
            paper["title"] = line.strip()
            break

    # Abstract
    abstract_match = re.search(
        r'Abstract[:\s]*
?(.*?)(?=
(?:1\.?\s+)?Introduction|

[A-Z])',
        full_text,
        re.DOTALL | re.IGNORECASE
    )
    if abstract_match:
        paper["abstract"] = abstract_match.group(1).strip()

    # Sections
    section_pattern = r'
(\d+\.?\s+[A-Z][^
]+)
'
    section_matches = re.finditer(section_pattern, full_text)

    section_positions = [(m.group(1), m.start()) for m in section_matches]

    for i, (title, start) in enumerate(section_positions):
        end = section_positions[i+1][1] if i+1 < len(section_positions) else len(full_text)
        content = full_text[start:end]

        paper["sections"].append({
            "title": title.strip(),
            "content": content.strip()
        })

    # References section
    ref_match = re.search(
        r'(?:References|Bibliography)\s*
(.*?)$',
        full_text,
        re.DOTALL | re.IGNORECASE
    )
    if ref_match:
        paper["references_text"] = ref_match.group(1).strip()

    return paper
```

## Full Harvesting Pipeline

```python
#!/usr/bin/env python3
"""Complete PDF harvesting pipeline."""

from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
import hashlib

async def harvest_pdf(
    pdf_path: str,
    collection: str,
    chunk_strategy: str = "paragraphs",  # pages, sections, paragraphs
    is_academic: bool = False,
    use_ocr: bool = False
) -> Dict:
    """
    Harvest a PDF document into RAG.

    Args:
        pdf_path: Path to PDF file
        collection: Target RAG collection
        chunk_strategy: How to chunk the document
        is_academic: Use academic paper extraction
        use_ocr: Force OCR extraction
    """
    path = Path(pdf_path)

    # Check if OCR needed
    if use_ocr or is_scanned_pdf(pdf_path):
        extracted = extract_with_ocr(pdf_path)
    else:
        extracted = extract_pdf_text(pdf_path)

    # Get document metadata
    doc_metadata = {
        "source_type": "pdf",
        "source_path": str(path.absolute()),
        "filename": path.name,
        "total_pages": extracted["total_pages"],
        "harvested_at": datetime.now().isoformat(),
        "pdf_metadata": extracted.get("metadata", {})
    }

    # Academic paper special handling
    if is_academic:
        paper = extract_academic_paper(pdf_path)
        doc_metadata["title"] = paper["title"]
        doc_metadata["abstract"] = paper["abstract"]
        doc_metadata["is_academic"] = True

    # Chunk based on strategy
    if chunk_strategy == "pages":
        chunks = chunk_by_pages(extracted)
    elif chunk_strategy == "sections":
        chunks = chunk_by_sections(extracted)
    else:
        chunks = chunk_by_paragraphs(extracted)

    # Generate document ID from content hash
    content_hash = hashlib.md5(
        "".join(p["text"] for p in extracted["pages"]).encode()
    ).hexdigest()[:12]
    doc_id = f"pdf_{content_hash}"

    # Ingest chunks
    ingested = 0
    for chunk in chunks:
        chunk_metadata = {
            **doc_metadata,
            "chunk_index": chunk["chunk_index"],
            "total_chunks": len(chunks),
        }

        # Add page info if available
        if "page_start" in chunk:
            chunk_metadata["page_start"] = chunk["page_start"]
            chunk_metadata["page_end"] = chunk["page_end"]

        # Add section info if available
        if "section" in chunk:
            chunk_metadata["section"] = chunk["section"]

        await ingest(
            content=chunk["content"],
            collection=collection,
            metadata=chunk_metadata,
            doc_id=f"{doc_id}_chunk_{chunk['chunk_index']}"
        )
        ingested += 1

    # Ingest tables separately
    for table in extracted.get("tables", []):
        table_metadata = {
            **doc_metadata,
            "content_type": "table",
            "page_number": table["page_number"],
            "table_number": table["table_number"]
        }

        await ingest(
            content=table["markdown"],
            collection=collection,
            metadata=table_metadata,
            doc_id=f"{doc_id}_table_{table['page_number']}_{table['table_number']}"
        )

    return {
        "status": "success",
        "filename": path.name,
        "pages": extracted["total_pages"],
        "chunks": ingested,
        "tables": len(extracted.get("tables", [])),
        "collection": collection,
        "doc_id": doc_id
    }


async def harvest_pdf_url(
    url: str,
    collection: str,
    **kwargs
) -> Dict:
    """Download and harvest a PDF from URL."""
    import httpx
    import tempfile

    # Download PDF
    async with httpx.AsyncClient() as client:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()

    # Save to temp file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
        f.write(response.content)
        temp_path = f.name

    try:
        result = await harvest_pdf(temp_path, collection, **kwargs)
        result["source_url"] = url
        return result
    finally:
        Path(temp_path).unlink()  # Clean up
```

## Metadata Schema

```yaml
# PDF chunk metadata
source_type: pdf
source_path: /path/to/document.pdf
source_url: https://... (if downloaded)
filename: document.pdf
total_pages: 45
page_start: 5
page_end: 7
section: "3. Methodology"
chunk_index: 12
total_chunks: 28
harvested_at: "2024-01-01T12:00:00Z"
is_academic: true
title: "Paper Title"
abstract: "Paper abstract..."
content_type: text|table
```

## Usage Examples

```python
# Local PDF
result = await harvest_pdf(
    pdf_path="/path/to/document.pdf",
    collection="research_papers",
    chunk_strategy="sections",
    is_academic=True
)

# PDF from URL
result = await harvest_pdf_url(
    url="https://arxiv.org/pdf/2301.00001.pdf",
    collection="ml_papers",
    is_academic=True
)

# Scanned document
result = await harvest_pdf(
    pdf_path="/path/to/scanned.pdf",
    collection="legacy_docs",
    use_ocr=True
)
```

## Refinement Notes

> Track improvements as you use this skill.

- [ ] Text extraction tested
- [ ] Table extraction working
- [ ] OCR fallback tested
- [ ] Academic paper pattern validated
- [ ] Chunking strategies compared
- [ ] Large PDF handling optimized