---
name: error-retry-tracking
description: Instrument error handling, retries, fallbacks, and failure patterns
triggers:
  - "error tracking"
  - "retry instrumentation"
  - "failure handling"
  - "fallback tracking"
  - "rate limit handling"
priority: 2
---

# Error and Retry Tracking

Instrument error handling to understand failure patterns and recovery behavior.

## Core Principle

Error observability answers:
1. **What failed** and why?
2. **How many retries** before success/failure?
3. **What fallbacks** were used?
4. **What's the recovery rate**?
5. **Are errors correlated** (rate limits, outages)?

## Error Classification

### Transient vs. Permanent
```python
TRANSIENT_ERRORS = [
    "RateLimitError",
    "TimeoutError",
    "ServiceUnavailable",
    "ConnectionError",
]

PERMANENT_ERRORS = [
    "InvalidRequestError",
    "AuthenticationError",
    "ContentPolicyViolation",
    "ContextLengthExceeded",
]

def classify_error(error: Exception) -> str:
    error_type = type(error).__name__
    if error_type in TRANSIENT_ERRORS:
        return "transient"
    elif error_type in PERMANENT_ERRORS:
        return "permanent"
    return "unknown"
```

## Error Span Attributes

```python
# Error identification (P0)
span.set_attribute("error.type", "RateLimitError")
span.set_attribute("error.message", "Rate limit exceeded")
span.set_attribute("error.category", "transient")
span.set_attribute("error.source", "llm_provider")

# Provider context (P1)
span.set_attribute("error.provider", "anthropic")
span.set_attribute("error.model", "claude-3-opus")
span.set_attribute("error.status_code", 429)
span.set_attribute("error.request_id", "req_abc123")

# Timing context (P1)
span.set_attribute("error.retry_after_ms", 60000)
span.set_attribute("error.occurred_at_step", 3)
span.set_attribute("error.time_into_request_ms", 2500)

# Impact (P2)
span.set_attribute("error.tokens_wasted", 1500)  # Tokens sent before failure
span.set_attribute("error.cost_wasted_usd", 0.015)
```

## Retry Span Attributes

```python
# Retry tracking (P0)
span.set_attribute("retry.attempt", 2)
span.set_attribute("retry.max_attempts", 3)
span.set_attribute("retry.strategy", "exponential_backoff")

# Timing (P1)
span.set_attribute("retry.delay_ms", 2000)
span.set_attribute("retry.total_wait_ms", 3500)
span.set_attribute("retry.jitter_ms", 150)

# Outcome (P0)
span.set_attribute("retry.success", True)
span.set_attribute("retry.final_attempt", 2)
span.set_attribute("retry.exhausted", False)
```

## Retry Wrapper Pattern

```python
from functools import wraps
from langfuse.decorators import observe
import time

def with_retry(
    max_attempts: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    exponential_base: float = 2.0,
):
    def decorator(func):
        @wraps(func)
        @observe(name=f"{func.__name__}.with_retry")
        def wrapper(*args, **kwargs):
            span = get_current_span()
            span.set_attribute("retry.max_attempts", max_attempts)
            span.set_attribute("retry.strategy", "exponential_backoff")

            last_error = None
            total_wait = 0

            for attempt in range(1, max_attempts + 1):
                try:
                    span.set_attribute("retry.attempt", attempt)
                    result = func(*args, **kwargs)
                    span.set_attribute("retry.success", True)
                    span.set_attribute("retry.final_attempt", attempt)
                    return result

                except Exception as e:
                    last_error = e
                    span.set_attribute("error.type", type(e).__name__)
                    span.set_attribute("error.category", classify_error(e))

                    if classify_error(e) == "permanent":
                        span.set_attribute("retry.exhausted", False)
                        span.set_attribute("retry.abort_reason", "permanent_error")
                        raise

                    if attempt < max_attempts:
                        delay = min(
                            base_delay * (exponential_base ** (attempt - 1)),
                            max_delay
                        )
                        total_wait += delay
                        span.add_event("retry.waiting", {"delay_ms": delay * 1000})
                        time.sleep(delay)

            span.set_attribute("retry.success", False)
            span.set_attribute("retry.exhausted", True)
            span.set_attribute("retry.total_wait_ms", total_wait * 1000)
            raise last_error

        return wrapper
    return decorator

@with_retry(max_attempts=3)
def call_llm(messages):
    return client.messages.create(messages=messages)
```

## Fallback Tracking

```python
# Fallback span attributes
span.set_attribute("fallback.triggered", True)
span.set_attribute("fallback.reason", "primary_model_unavailable")
span.set_attribute("fallback.from_model", "claude-3-opus")
span.set_attribute("fallback.to_model", "claude-3-sonnet")
span.set_attribute("fallback.quality_impact", "reduced")

# Fallback chain
span.set_attribute("fallback.chain", ["opus", "sonnet", "haiku"])
span.set_attribute("fallback.chain_position", 2)
```

## Rate Limit Handling

```python
# Rate limit specific attributes
span.set_attribute("rate_limit.type", "tokens_per_minute")
span.set_attribute("rate_limit.limit", 100000)
span.set_attribute("rate_limit.remaining", 0)
span.set_attribute("rate_limit.reset_at", "2024-01-15T10:01:00Z")
span.set_attribute("rate_limit.retry_after_ms", 45000)

# Proactive rate limiting
span.set_attribute("rate_limit.preemptive_wait", True)
span.set_attribute("rate_limit.tokens_queued", 5000)
```

## Circuit Breaker Pattern

```python
from enum import Enum

class CircuitState(Enum):
    CLOSED = "closed"      # Normal operation
    OPEN = "open"          # Failing, reject requests
    HALF_OPEN = "half_open"  # Testing recovery

# Circuit breaker attributes
span.set_attribute("circuit.state", "open")
span.set_attribute("circuit.failure_count", 5)
span.set_attribute("circuit.failure_threshold", 5)
span.set_attribute("circuit.last_failure_at", timestamp)
span.set_attribute("circuit.opens_at", timestamp)
span.set_attribute("circuit.half_open_attempts", 0)
```

## Error Aggregation

Track error patterns:
```python
# Per-session error summary
span.set_attribute("session.total_errors", 3)
span.set_attribute("session.transient_errors", 2)
span.set_attribute("session.permanent_errors", 1)
span.set_attribute("session.retry_success_rate", 0.67)

# Per-provider health
span.set_attribute("provider.health", "degraded")
span.set_attribute("provider.error_rate_1h", 0.05)
span.set_attribute("provider.avg_latency_1h_ms", 2500)
```

## Framework Integration

### LangChain Retry
```python
from langchain.chat_models import ChatAnthropic
from langfuse.callback import CallbackHandler

llm = ChatAnthropic(
    model="claude-3-opus",
    max_retries=3,
    request_timeout=30,
)

# Callbacks capture retry behavior
handler = CallbackHandler()
response = llm.invoke(messages, config={"callbacks": [handler]})
```

### Tenacity Integration
```python
from tenacity import retry, stop_after_attempt, wait_exponential
from langfuse.decorators import observe

@observe(name="llm.call")
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=60),
)
def call_with_tenacity(messages):
    return client.messages.create(messages=messages)
```

## Anti-Patterns

- Catching all exceptions silently (hidden failures)
- No retry tracking (can't optimize retry config)
- Missing error classification (can't distinguish transient vs. permanent)
- No fallback logging (unclear degradation)
- Retrying permanent errors (wasted cost)

## Related Skills
- `llm-call-tracing` - LLM error context
- `tool-call-tracking` - Tool error handling