#version: "3.9" # Supported by both podman-compose and Docker Compose v2+

###############################################################################
# HOST SYSTEM TUNING FOR LOAD TESTING (run before docker compose up)
# See docs/docs/testing/performance.md for full details
#
# One-liner (TCP + VM + I/O tuning):
#   sudo sysctl -w net.core.somaxconn=65535 net.core.netdev_max_backlog=65535 net.ipv4.tcp_max_syn_backlog=65535 net.ipv4.tcp_tw_reuse=1 net.ipv4.tcp_fin_timeout=15 net.ipv4.ip_local_port_range="1024 65535" vm.swappiness=10 fs.aio-max-nr=1048576
#
# Make persistent: sudo tee /etc/sysctl.d/99-mcp-loadtest.conf (see docs)
###############################################################################
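#
# Illustrative sketch of that persistent file (values mirror the one-liner above;
# the file shipped in the docs may differ):
#   sudo tee /etc/sysctl.d/99-mcp-loadtest.conf <<'EOF'
#   net.core.somaxconn = 65535
#   net.core.netdev_max_backlog = 65535
#   net.ipv4.tcp_max_syn_backlog = 65535
#   net.ipv4.tcp_tw_reuse = 1
#   net.ipv4.tcp_fin_timeout = 15
#   net.ipv4.ip_local_port_range = 1024 65535
#   vm.swappiness = 10
#   fs.aio-max-nr = 1048576
#   EOF
#   sudo sysctl --system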
###############################################################################
# DOCKER COMPOSE PROFILES
#
# Default (no profile):  Gateway + Postgres + Redis + Nginx (HTTP only)
# --profile monitoring:  Adds Prometheus, Grafana, Loki, exporters
# --profile benchmark:   Adds benchmark MCP servers for load testing
# --profile tls:         Enables HTTPS via nginx_tls (auto-generates certs)
#
# TLS Quick Start:
#   make compose-tls          # HTTP:8080 + HTTPS:8443
#   make compose-tls-https    # Force HTTPS (HTTP redirects)
#   curl -sk https://localhost:8443/health
#
# Custom certificates:
#   mkdir -p certs && cp your-cert.pem certs/cert.pem && cp your-key.pem certs/key.pem
#   make compose-tls
#
# Environment variables (TLS profile):
#   NGINX_FORCE_HTTPS=true    # Redirect all HTTP to HTTPS
###############################################################################

###############################################################################
# NETWORKS + VOLUMES - declared first so they can be referenced later
###############################################################################
networks:
  mcpnet: # Single user-defined bridge network keeps traffic private
    driver: bridge

volumes: # Named volumes survive podman-compose down/up
  pgdata:
  # pgdata18: # Enable for postgres 18+
  pgadmindata:
  redisinsight_data:
  nginx_cache:
  grafanadata:
  prometheusdata:
  lokidata:

###############################################################################
# CORE SERVICE - MCP Gateway
###############################################################################
services:
  # ──────────────────────────────────────────────────────────────────────
  # Nginx Caching Proxy - High-performance reverse proxy with CDN-like caching
  # ──────────────────────────────────────────────────────────────────────
  nginx:
    build:
      context: ./infra/nginx
      dockerfile: Dockerfile
    image: mcpgateway/nginx-cache:latest
    restart: unless-stopped
    ports:
      - "8080:80" # HTTP caching proxy (public-facing)
      # - "8443:443" # HTTPS caching proxy (public-facing)
    networks: [mcpnet]
    depends_on:
      gateway:
        condition: service_healthy
    volumes:
      - nginx_cache:/var/cache/nginx # Persistent cache storage
      - ./infra/nginx/nginx.conf:/etc/nginx/nginx.conf:ro # Mount config as read-only
      # - ./certs:/app/certs:ro # Mount SSL certs for HTTPS backend verification
    # TCP kernel tuning for 3000 concurrent connections
    # Note: most net.core.* sysctls are host-level and cannot be set per-container
    # (net.core.somaxconn is the namespaced exception, used by fast_test_server below);
    # only network-namespace-aware sysctls such as net.ipv4.* work here
    sysctls:
      - net.ipv4.tcp_fin_timeout=15 # Faster cleanup of FIN_WAIT2 sockets
      - net.ipv4.ip_local_port_range=1024 65535 # More ephemeral ports for upstream
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 1G
        reservations:
          cpus: '2'
          memory: 512M

  # ──────────────────────────────────────────────────────────────────────
  # MCP Gateway - the main API server for the MCP stack
  # ──────────────────────────────────────────────────────────────────────
  gateway:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest} # Use the local latest image. Run `make docker-prod` to build it.
    #image: ghcr.io/ibm/mcp-context-forge:1.0.0-BETA-1 # Use the release MCP Context Forge image
    #image: ghcr.io/ibm/mcp-context-forge:0.7.0 # Testing migration from 0.7.0
    build:
      context: .
      dockerfile: Containerfile.lite # Same one the Makefile builds
    restart: unless-stopped
    # NOTE: When using replicas > 1, access via nginx:8080 instead of direct port 4444
    # ports:
    #   - "4444:4444" # Disabled for multi-replica mode
    networks: [mcpnet]
    # ──────────────────────────────────────────────────────────────────────
    # Environment - pick ONE database URL line, comment the rest
    # ──────────────────────────────────────────────────────────────────────
    environment:
      # ═══════════════════════════════════════════════════════════════════════════
      # HTTP Server Selection: gunicorn vs granian
      # ═══════════════════════════════════════════════════════════════════════════
      # Performance comparison (2500 concurrent users, PostgreSQL backend):
      #   Gunicorn: ~2.7GB RAM, ~740% CPU, no backpressure (queues unbounded)
      #   Granian:  ~4.0GB RAM, ~680% CPU, native backpressure (rejects excess with 503)
      #
      # Choose Gunicorn for: memory-constrained environments (32% less RAM)
      # Choose Granian for:  load spike protection, bursty traffic (graceful degradation)
      # Both achieve the same RPS when the database is the bottleneck.
      # ═══════════════════════════════════════════════════════════════════════════
      # - HTTP_SERVER=granian # Rust-based, native backpressure, +47% memory, -8% CPU
      - HTTP_SERVER=gunicorn # Python-based, battle-tested, lower memory usage
      - HOST=0.0.0.0
      - PORT=4444
      # Domain for CORS/cookies (nginx default at http://localhost:8080)
      - APP_DOMAIN=${APP_DOMAIN:-http://localhost:8080}
      # Transport: sse, streamablehttp, http, or all (default: all)
      - TRANSPORT_TYPE=streamablehttp
      # Database connection: via PgBouncer (default) or direct PostgreSQL
      # PgBouncer provides connection pooling for better performance under high concurrency
      - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@pgbouncer:6432/mcp
      # Direct PostgreSQL connection (bypass PgBouncer - increase DB_POOL_SIZE if using):
      # - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
      # SQLAlchemy query logging (useful for N+1 detection; noisy under load)
      # NOTE: SQLALCHEMY_ECHO logs at INFO; set LOG_LEVEL=INFO/DEBUG to see output.
      - SQLALCHEMY_ECHO=false
      - CACHE_TYPE=redis # backend for caching (memory, redis, database, or none)
      - REDIS_URL=redis://redis:6379/0
      # Redis parser: hiredis (C extension ~83x faster for large responses)
      - REDIS_PARSER=hiredis
      # Redis connection pool tuning for load testing (3 replicas × 24 workers × 100 = 7200 < 10000 maxclients)
      - REDIS_MAX_CONNECTIONS=100
      - REDIS_SOCKET_TIMEOUT=5.0
      - REDIS_SOCKET_CONNECT_TIMEOUT=5.0
      - REDIS_HEALTH_CHECK_INTERVAL=30
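      # Illustrative check of the budget above against Redis's maxclients at runtime
      # (assumes the redis service defined later in this file is running):
      #   docker compose exec redis redis-cli info clients
      #   docker compose exec redis redis-cli config get maxclients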
      # ═══════════════════════════════════════════════════════════════════════════
      # Redis Startup Resilience (prevents crash-loop on Redis outage)
      # ═══════════════════════════════════════════════════════════════════════════
      # With exponential backoff: 2s, 4s, 8s, 16s, 30s (capped), 30s...
      # 30 retries = ~5 minutes total wait before worker gives up
      - REDIS_MAX_RETRIES=30 # Max attempts before worker exits (default: 30)
      - REDIS_RETRY_INTERVAL_MS=2000 # Base interval, grows exponentially with jitter
      - REDIS_MAX_BACKOFF_SECONDS=30 # Max backoff cap (jitter ±25% applied after)
      # Auth Cache Configuration (reduces DB queries per auth request from 3-4 to 0-1)
      - AUTH_CACHE_ENABLED=${AUTH_CACHE_ENABLED:-true}
      - AUTH_CACHE_USER_TTL=300
      - AUTH_CACHE_REVOCATION_TTL=120
      - AUTH_CACHE_TEAM_TTL=300
      - AUTH_CACHE_ROLE_TTL=300
      - AUTH_CACHE_BATCH_QUERIES=true
      - AUTH_CACHE_TEAMS_TTL=300
      # Registry Cache Configuration (reduces DB queries for list endpoints)
      - REGISTRY_CACHE_ENABLED=true
      - REGISTRY_CACHE_TOOLS_TTL=300
      - REGISTRY_CACHE_PROMPTS_TTL=300
      - REGISTRY_CACHE_RESOURCES_TTL=300
      - REGISTRY_CACHE_AGENTS_TTL=300
      - REGISTRY_CACHE_SERVERS_TTL=300
      - REGISTRY_CACHE_GATEWAYS_TTL=300
      - REGISTRY_CACHE_CATALOG_TTL=300
      # Admin Stats Cache Configuration (reduces aggregate queries for dashboard)
      - ADMIN_STATS_CACHE_ENABLED=true
      - ADMIN_STATS_CACHE_SYSTEM_TTL=60
      - ADMIN_STATS_CACHE_OBSERVABILITY_TTL=30
      - ADMIN_STATS_CACHE_TAGS_TTL=120
      - ADMIN_STATS_CACHE_PLUGINS_TTL=120
      - ADMIN_STATS_CACHE_PERFORMANCE_TTL=60
      # Team member count cache (reduces N+1 queries)
      - TEAM_MEMBER_COUNT_CACHE_ENABLED=true
      - TEAM_MEMBER_COUNT_CACHE_TTL=300
      # Metrics aggregation cache (reduces full table scans, see #1906)
      - METRICS_CACHE_ENABLED=true
      - METRICS_CACHE_TTL_SECONDS=120
      # MCP Server Health Check
      # Interval in seconds between health checks (default: 300)
      - HEALTH_CHECK_INTERVAL=300
      # Timeout in seconds for each health check request (default: 5)
      - HEALTH_CHECK_TIMEOUT=5
      # Consecutive failures before marking gateway offline (default: 3)
      - UNHEALTHY_THRESHOLD=3
      # Gateway URL validation timeout in seconds (default: 5)
      - GATEWAY_VALIDATION_TIMEOUT=5
      # Max concurrent health checks per worker (default: 10)
      - MAX_CONCURRENT_HEALTH_CHECKS=10
      # JWT Configuration - Choose ONE approach:
      # Option 1: HMAC (Default - Simple deployments)
      - JWT_ALGORITHM=HS256
      - JWT_SECRET_KEY=my-test-key
      # Option 2: RSA (Production - Asymmetric, uncomment and generate certs)
      # - JWT_ALGORITHM=RS256
      # - JWT_PUBLIC_KEY_PATH=/app/certs/jwt/public.pem
      # - JWT_PRIVATE_KEY_PATH=/app/certs/jwt/private.pem
      - JWT_AUDIENCE=mcpgateway-api
      - JWT_ISSUER=mcpgateway
      # Basic auth is DISABLED by default for security (API_ALLOW_BASIC_AUTH=false)
      # Only set these if you explicitly enable Basic auth for backwards compatibility
      # - API_ALLOW_BASIC_AUTH=true
      # - BASIC_AUTH_USER=${BASIC_AUTH_USER:-admin}
      # - BASIC_AUTH_PASSWORD=${BASIC_AUTH_PASSWORD:-changeme}
      # Auth encryption secret + default user password
      - AUTH_ENCRYPTION_SECRET=${AUTH_ENCRYPTION_SECRET:-my-test-salt}
      - DEFAULT_USER_PASSWORD=${DEFAULT_USER_PASSWORD:-changeme}
      # Admin UI uses email/password authentication
      - EMAIL_AUTH_ENABLED=true
      - PLATFORM_ADMIN_EMAIL=admin@example.com
      - PLATFORM_ADMIN_PASSWORD=changeme
      # Security defaults (tokens require expiration and JTI for revocation)
      - REQUIRE_TOKEN_EXPIRATION=${REQUIRE_TOKEN_EXPIRATION:-true}
      - REQUIRE_JTI=${REQUIRE_JTI:-true}
      - REQUIRE_USER_IN_DB=${REQUIRE_USER_IN_DB:-false}
      - MCPGATEWAY_UI_ENABLED=true
      - MCPGATEWAY_ADMIN_API_ENABLED=true
      # Security configuration (using defaults)
      - ENVIRONMENT=development
      - SECURITY_HEADERS_ENABLED=true
      - CORS_ALLOW_CREDENTIALS=true
      - SECURE_COOKIES=false
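      # Illustrative: with the HS256 settings above, an API bearer token can be minted
      # the same way the register_* helper services below do (the exec target and the
      # nginx port 8080 are taken from this file's defaults; adjust if you changed them):
      #   TOKEN=$(docker compose exec -T gateway python3 -m mcpgateway.utils.create_jwt_token \
      #     --username admin@example.com --exp 10080 --secret my-test-key --algo HS256)
      #   curl -s -H "Authorization: Bearer $TOKEN" http://localhost:8080/version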
      # ═══════════════════════════════════════════════════════════════════════════
      # SSRF Protection (Server-Side Request Forgery)
      # ═══════════════════════════════════════════════════════════════════════════
      # Prevents the gateway from accessing internal resources or cloud metadata services.
      # Default: enabled with safe settings for dev/internal deployments.
      # Cloud metadata (169.254.169.254, etc.) is ALWAYS blocked by default.
      # - SSRF_PROTECTION_ENABLED=true # Master switch (default: true)
      # - SSRF_ALLOW_LOCALHOST=true # Allow localhost (default: true for dev)
      # - SSRF_ALLOW_PRIVATE_NETWORKS=true # Allow 10.x, 172.16.x, 192.168.x (default: true)
      # - SSRF_DNS_FAIL_CLOSED=false # Reject on DNS failure (default: false = fail open)
      # For strict production mode (external endpoints only):
      # - SSRF_ALLOW_LOCALHOST=false
      # - SSRF_ALLOW_PRIVATE_NETWORKS=false
      # - SSRF_DNS_FAIL_CLOSED=true
      # Uncomment to enable stateful sessions for Streamable HTTP transport
      # - USE_STATEFUL_SESSIONS=true
      # Uncomment to enable session affinity between downstream (from client) and upstream (to MCP server) sessions
      # - MCPGATEWAY_SESSION_AFFINITY_ENABLED=true
      ## Uncomment to enable HTTPS (run `make certs` first)
      # - SSL=true
      # - CERT_FILE=/app/certs/cert.pem
      # - KEY_FILE=/app/certs/key.pem
      # For passphrase-protected keys: run `make certs-passphrase` and use:
      # - KEY_FILE=/app/certs/key-encrypted.pem
      # - KEY_FILE_PASSWORD=${KEY_FILE_PASSWORD}
      # Plugins (enabled here)
      - PLUGINS_ENABLED=true
      # MCP server catalog (enabled here)
      - MCPGATEWAY_CATALOG_ENABLED=true
      - MCPGATEWAY_CATALOG_FILE=/app/mcp-catalog.yml
      # Authentication configuration
      - AUTH_REQUIRED=true
      - MCP_CLIENT_AUTH_ENABLED=true
      - TRUST_PROXY_AUTH=false
      # Logging configuration
      # NOTE: LOG_LEVEL=INFO/DEBUG is required for SQLALCHEMY_ECHO output.
      - LOG_LEVEL=${LOG_LEVEL:-ERROR} # Default ERROR for load testing; raise to INFO/DEBUG to see SQLALCHEMY_ECHO output
      - DISABLE_ACCESS_LOG=true # Disable uvicorn access logs for performance (massive I/O overhead)
      # Template auto-reload disabled for performance (prevents re-parsing templates on each request)
      - TEMPLATES_AUTO_RELOAD=false
      - STRUCTURED_LOGGING_DATABASE_ENABLED=false # Disable DB logging for performance (use true only for debugging)
      # Audit trail logging - disabled by default for performance
      # WARNING: Causes a DB write on EVERY API request - can generate millions of rows during load testing!
      - AUDIT_TRAIL_ENABLED=false # Set to true for compliance requirements (SOC2, HIPAA, etc.)
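      # To confirm the logging settings above take effect (access logs off, LOG_LEVEL
      # honored), tailing recent gateway output is usually enough (illustrative):
      #   docker compose logs --tail=50 gateway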
      # Security event logging - disabled by default for performance
      # WARNING: "all" level logs every request and causes massive DB write load
      - SECURITY_LOGGING_ENABLED=false # Set to true to enable security event logging
      - SECURITY_LOGGING_LEVEL=failures_only # Options: all, failures_only, high_severity
      # Performance optimizations - disable CPU-intensive middlewares
      # NOTE: Compression is disabled here because nginx in front already compresses
      # responses. Keep it enabled if you run the gateway without nginx, otherwise
      # the larger payloads cause a throughput drop.
      - COMPRESSION_ENABLED=false
      # Disable optional middlewares for maximum throughput
      - VALIDATION_MIDDLEWARE_ENABLED=true
      - CORRELATION_ID_ENABLED=false
      - LLMCHAT_ENABLED=true
      - OBSERVABILITY_ENABLED=false
      # ═══════════════════════════════════════════════════════════════════════════
      # Database Connection Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Pool class options:
      #   - "null":  NullPool - no application pooling, PgBouncer handles all pooling (recommended)
      #   - "queue": QueuePool - application-side pooling (use with direct PostgreSQL)
      #   - "auto":  Automatic - NullPool if PgBouncer detected in URL, else QueuePool
      #
      # WITH PgBouncer (default in docker-compose):
      # Option A: NullPool - safest, eliminates stale connection errors, ~10% slower
      # - DB_POOL_CLASS=null
      # Option B: QueuePool + pre_ping - better performance, validates before use
      - DB_POOL_CLASS=queue
      - DB_POOL_PRE_PING=true # Validate connections before use (SELECT 1)
      - DB_POOL_SIZE=20 # Pool size per worker
      - DB_MAX_OVERFLOW=10 # Extra connections under load
      - DB_POOL_TIMEOUT=60 # Time to wait for connection before failing
      - DB_POOL_RECYCLE=60 # Recycle connections before PgBouncer's CLIENT_IDLE_TIMEOUT closes them
      # ═══════════════════════════════════════════════════════════════════════════
      # Database Startup Resilience (prevents crash-loop on DB outage)
      # ═══════════════════════════════════════════════════════════════════════════
      # With exponential backoff: 2s, 4s, 8s, 16s, 30s (capped), 30s...
      # 30 retries = ~5 minutes total wait before worker gives up
      - DB_MAX_RETRIES=30 # Max attempts before worker exits (default: 30)
      - DB_RETRY_INTERVAL_MS=2000 # Base interval, grows exponentially with jitter
      - DB_MAX_BACKOFF_SECONDS=30 # Max backoff cap (jitter ±25% applied after)
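      # Rough connection budget implied by the values above (illustrative arithmetic):
      #   3 replicas × 24 Gunicorn workers × (20 pool + 10 overflow) = 2160 possible
      #   client connections, which is why PgBouncer below allows MAX_CLIENT_CONN=5000
      #   while multiplexing them onto at most MAX_DB_CONNECTIONS=700 server connections.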
      # Tool configuration for high-concurrency load testing
      - TOOL_TIMEOUT=60 # Seconds before tool invocation times out
      - MAX_TOOL_RETRIES=3 # Retry attempts for failed tool invocations
      - TOOL_RATE_LIMIT=60000 # Max tool invocations per minute
      - TOOL_CONCURRENT_LIMIT=1000 # Max concurrent tool invocations
      - FEDERATION_TIMEOUT=30
      # ═══════════════════════════════════════════════════════════════════════════
      # HTTPX Client Connection Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Shared HTTP client for all outbound requests (federation, health checks,
      # A2A, SSO, catalog). Provides ~20x better performance than per-request clients.
      - HTTPX_MAX_CONNECTIONS=500 # Total connections in pool (increased from 200 for high concurrency)
      - HTTPX_MAX_KEEPALIVE_CONNECTIONS=300 # Keepalive connections (increased from 100)
      - HTTPX_KEEPALIVE_EXPIRY=30.0 # Idle connection expiry (seconds)
      - HTTPX_CONNECT_TIMEOUT=5.0 # TCP connection timeout (seconds)
      - HTTPX_READ_TIMEOUT=120.0 # Response read timeout (seconds, high for slow tools)
      - HTTPX_WRITE_TIMEOUT=30.0 # Request write timeout (seconds)
      - HTTPX_POOL_TIMEOUT=10.0 # Wait for available connection (seconds)
      - HTTPX_HTTP2_ENABLED=false # HTTP/2 support (requires server support)
      - HTTPX_ADMIN_READ_TIMEOUT=30.0 # Admin UI/health check timeout (seconds)
      # ═══════════════════════════════════════════════════════════════════════════
      # Gunicorn Configuration (used when HTTP_SERVER=gunicorn)
      # ═══════════════════════════════════════════════════════════════════════════
      - GUNICORN_WORKERS=24 # Worker processes (match CPU cores)
      - GUNICORN_TIMEOUT=120 # Worker timeout in seconds
      - GUNICORN_GRACEFUL_TIMEOUT=60 # Grace period for worker shutdown
      - GUNICORN_KEEP_ALIVE=30 # Keep-alive timeout (matches SSE keepalive)
      # Worker recycling cleans up MCP SDK stuck task groups (anyio#695 workaround)
      - GUNICORN_MAX_REQUESTS=1000000 # Recycle workers after 1M requests
      - GUNICORN_MAX_REQUESTS_JITTER=100000 # ±100000 jitter prevents thundering herd
      - GUNICORN_BACKLOG=4096 # Connection queue depth
      # ═══════════════════════════════════════════════════════════════════════════
      # Granian Backpressure Configuration (used when HTTP_SERVER=granian)
      # ═══════════════════════════════════════════════════════════════════════════
      # Backpressure provides overload protection by rejecting excess requests with
      # immediate 503 responses instead of queuing them (which can cause OOM/timeouts).
      # Total capacity = GRANIAN_WORKERS × GRANIAN_BACKPRESSURE = 16 × 128 = 2048 concurrent
      # Requests beyond this limit receive an immediate 503 (no queuing, no OOM)
      - GRANIAN_WORKERS=16
      - GRANIAN_BACKLOG=4096
      - GRANIAN_BACKPRESSURE=128
      - GRANIAN_HTTP1_BUFFER_SIZE=524288
      - GRANIAN_RESPAWN_FAILED=true
      # ───────────────────────────────────────────────────────────────────────
      # Granian Worker Lifecycle (recycling to prevent resource leaks)
      # ───────────────────────────────────────────────────────────────────────
      # Workaround for granian issue where SSE connections may not be properly
      # closed after client disconnect, causing CPU spin loops. See:
      #   - https://github.com/emmett-framework/granian/issues/286
      #   - https://github.com/IBM/mcp-context-forge/issues/2357
      #
      # GRANIAN_WORKERS_LIFETIME: Restart workers after this duration (min 60s)
      # GRANIAN_WORKERS_MAX_RSS:  Restart workers exceeding this memory (MiB)
      #
      # Using both provides natural jitter - workers hit memory limits at
      # different times based on load, with lifetime as a backstop.
      # - GRANIAN_WORKERS_LIFETIME=3600 # 1 hour max worker lifetime
      # - GRANIAN_WORKERS_MAX_RSS=512 # 512 MiB max RSS per worker
      # ───────────────────────────────────────────────────────────────────────
      # HTTP/2: Granian supports native HTTP/2 multiplexing, but it is not useful here because:
      #   - nginx sits in front and downgrades to HTTP/1.1 for upstream connections
      #   - nginx open-source doesn't support HTTP/2 to backends (only nginx Plus does)
      #   - the internal Docker network is fast enough that HTTP/2 gains are negligible
      # To use HTTP/2, either bypass nginx or use Granian with TLS directly.
      # - GRANIAN_HTTP=2
      # ═══════════════════════════════════════════════════════════════════════════
      # MCP Session Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Session pooling for MCP ClientSessions reduces per-request overhead from
      # ~20ms to ~1-2ms (10-20x improvement). Sessions are isolated per user/tenant
      # via identity hashing to prevent cross-user session sharing.
      - MCP_SESSION_POOL_ENABLED=true # Enable session pooling (default: false, enabled for docker-compose)
      - MCP_SESSION_POOL_MAX_PER_KEY=200 # Max sessions per (URL, identity, transport) - increased from 150 for 4000+ users
      - MCP_SESSION_POOL_TTL=300.0 # Session TTL in seconds (default: 300)
      - MCP_SESSION_POOL_HEALTH_CHECK_INTERVAL=60.0 # Idle time before health check (default: 60)
      - MCP_SESSION_POOL_ACQUIRE_TIMEOUT=60.0 # Timeout waiting for session slot (default: 30)
      - MCP_SESSION_POOL_CREATE_TIMEOUT=30.0 # Timeout creating new session (default: 30)
      - MCP_SESSION_POOL_CIRCUIT_BREAKER_THRESHOLD=5 # Failures before circuit opens
      - MCP_SESSION_POOL_CIRCUIT_BREAKER_RESET=60.0 # Seconds before circuit resets
      - MCP_SESSION_POOL_IDLE_EVICTION=600.0 # Evict idle pool keys after (default: 600)
      - MCP_SESSION_POOL_TRANSPORT_TIMEOUT=30.0 # Timeout for all HTTP operations (default: 30)
      - MCP_SESSION_POOL_EXPLICIT_HEALTH_RPC=false # Force RPC on health checks (default: false)
      # Configurable health check chain - ordered list of methods to try (JSON array)
      # Options: ping, list_tools, list_prompts, list_resources, skip
      # - MCP_SESSION_POOL_HEALTH_CHECK_METHODS=["ping", "skip"] # Try ping, skip if unsupported
      - MCP_SESSION_POOL_HEALTH_CHECK_METHODS=["skip"] # skip, highest performance
      - MCP_SESSION_POOL_HEALTH_CHECK_TIMEOUT=5.0 # Timeout per health check attempt
      # ═══════════════════════════════════════════════════════════════════════════
      # CPU Spin Loop Mitigation (Issue #2360, anyio#695)
      # ═══════════════════════════════════════════════════════════════════════════
      # These settings mitigate CPU spin loops that can occur when SSE/MCP connections
      # are cancelled and internal tasks don't respond to CancelledError. The spin
      # happens in anyio's _deliver_cancellation method.
      #
      # See documentation: docs/docs/operations/cpu-spin-loop-mitigation.md
      # GitHub Issue:   https://github.com/IBM/mcp-context-forge/issues/2360
      # Upstream Issue: https://github.com/agronholm/anyio/issues/695
      #
      # ─────────────────────────────────────────────────────────────────────────
      # Layer 1: SSE Connection Protection
      # ─────────────────────────────────────────────────────────────────────────
      # Detect and close dead SSE connections before they cause spin loops.
      - SSE_SEND_TIMEOUT=30.0 # ASGI send() timeout (default: 30.0)
      - SSE_RAPID_YIELD_WINDOW_MS=1000 # Detection window in ms (default: 1000)
      - SSE_RAPID_YIELD_MAX=50 # Max yields before disconnect (default: 50, 0=disabled)
      # ─────────────────────────────────────────────────────────────────────────
      # Layer 2: Cleanup Timeouts
      # ─────────────────────────────────────────────────────────────────────────
      # Limit how long cleanup waits for stuck tasks. Short timeouts (0.5s) reduce
      # CPU waste during cancelled connection cleanup. Only affects cleanup, not
      # normal operation.
      - MCP_SESSION_POOL_CLEANUP_TIMEOUT=0.5 # Session __aexit__ timeout (default: 5.0)
      - SSE_TASK_GROUP_CLEANUP_TIMEOUT=0.5 # SSE task group timeout (default: 5.0)
      # ─────────────────────────────────────────────────────────────────────────
      # Layer 3: EXPERIMENTAL - anyio Monkey-Patch
      # ─────────────────────────────────────────────────────────────────────────
      # Last resort: patches anyio to limit _deliver_cancellation iterations.
      # Enable only if Layers 1-2 don't fully resolve the issue.
      # WARNING: May be removed when anyio/MCP SDK fix the upstream issue.
      - ANYIO_CANCEL_DELIVERY_PATCH_ENABLED=true # Enable workaround - TESTING
      - ANYIO_CANCEL_DELIVERY_MAX_ITERATIONS=500 # Max iterations before giving up (~60ms recovery)
      # ═══════════════════════════════════════════════════════════════════════════
      # Execution Metrics Recording
      # ═══════════════════════════════════════════════════════════════════════════
      # Controls tool/resource/prompt/server/A2A execution metrics (one DB row per operation).
      # Disable when using external observability to improve performance.
      # Set to true if you need per-operation metrics in the database.
      # Note: Does NOT affect log aggregation (METRICS_AGGREGATION_ENABLED) or Prometheus.
      - DB_METRICS_RECORDING_ENABLED=true
      # ═══════════════════════════════════════════════════════════════════════════
      # Metrics Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Raw metrics are deleted after hourly rollups exist (default: 1 hour retention).
      # Rollups preserve all analytics (counts, p50/p95/p99) for 365 days.
      #
      # If using external observability (ELK, Datadog, Splunk), raw metrics are
      # redundant - your external platform handles debugging and audit trails.
      #
      # Configurable settings (uncomment to override defaults):
      # - METRICS_DELETE_RAW_AFTER_ROLLUP=true # Delete raw after rollup (default)
      # - METRICS_DELETE_RAW_AFTER_ROLLUP_HOURS=1 # Raw retention when rollup exists
      # - METRICS_CLEANUP_INTERVAL_HOURS=1 # Cleanup frequency (default: hourly)
      # - METRICS_RETENTION_DAYS=7 # Fallback retention (rollup disabled)
      #
      # For debugging without external observability, increase raw retention:
      # - METRICS_DELETE_RAW_AFTER_ROLLUP_HOURS=168 # Keep raw data 7 days
      # Phoenix Observability Integration (uncomment when using Phoenix)
      # - PHOENIX_ENDPOINT=${PHOENIX_ENDPOINT:-http://phoenix:6006}
      # - OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:-http://phoenix:4317}
      # - OTEL_SERVICE_NAME=${OTEL_SERVICE_NAME:-mcp-gateway}
      # - OTEL_TRACES_EXPORTER=${OTEL_TRACES_EXPORTER:-otlp}
      # - OTEL_METRICS_EXPORTER=${OTEL_METRICS_EXPORTER:-otlp}
      # - OTEL_RESOURCE_ATTRIBUTES=${OTEL_RESOURCE_ATTRIBUTES:-deployment.environment=docker,service.namespace=mcp}
    # TCP kernel tuning for high-concurrency MCP tool invocations
    # Each tool call creates a new connection → many TIME_WAIT sockets
    sysctls:
      - net.ipv4.tcp_fin_timeout=15 # Faster cleanup of FIN_WAIT2 sockets (default: 60)
      - net.ipv4.ip_local_port_range=1024 65535 # More ephemeral ports (default: 32768-60999)
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
    depends_on: # Default stack: PgBouncer + Redis (PgBouncer depends on Postgres)
      pgbouncer:
        condition: service_healthy # ▶ wait for connection pooler
      redis:
        condition: service_started
      # Direct PostgreSQL (uncomment if bypassing PgBouncer):
      # postgres:
      #   condition: service_healthy
      # migration:
      #   condition: service_completed_successfully
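    # The healthcheck below probes /health with Python's urllib because curl may not
    # be present in the lite image. The same probe can be run by hand (illustrative):
    #   docker compose exec gateway python3 -c \
    #     "import urllib.request; print(urllib.request.urlopen('http://localhost:4444/health', timeout=5).read().decode())"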
    healthcheck:
      ## HTTP healthcheck (default)
      test: ["CMD", "python3", "-c", "import urllib.request; import json; resp = urllib.request.urlopen('http://localhost:4444/health', timeout=5); data = json.loads(resp.read()); exit(0 if data.get('status') == 'healthy' else 1)"]
      ## Uncomment for HTTPS healthcheck (requires valid SSL cert)
      # test: ["CMD", "curl", "-f", "https://localhost:4444/health"]
      ## HTTPS healthcheck with SSL validation skipped (self-signed certs)
      # test: ["CMD", "curl", "-fk", "https://localhost:4444/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    # Scaling options:
    #   - Single instance: use port 4444 directly, replicas: 1
    #   - Multi-instance:  comment out ports, set replicas: 2+, access via nginx:8080
    # ──────────────────────────────────────────────────────────────────────
    # Server Engine Selection (Default: Granian - Rust-based HTTP server)
    # ──────────────────────────────────────────────────────────────────────
    # The image default is Granian; this file selects Gunicorn via HTTP_SERVER above.
    # To run Gunicorn with Uvicorn workers explicitly:
    # command: ["./run-gunicorn.sh"]
    deploy:
      mode: replicated
      replicas: 3
      resources:
        limits:
          cpus: '8'
          memory: 8G
        reservations:
          cpus: '4'
          memory: 4G
    # ──────────────────────────────────────────────────────────────────────
    # Volume Mounts
    # ──────────────────────────────────────────────────────────────────────
    # Uncomment to mount catalog configuration and SSL certificates
    # volumes:
    #   - ./mcp-catalog.yml:/app/mcp-catalog.yml:ro # mount catalog configuration
    #   - ./certs:/app/certs:ro # mount certs folder read-only (includes both SSL and JWT keys)
    #
    # SSL/TLS Certificate Setup:
    #   1. Generate certificates:
    #      - Without passphrase: make certs
    #      - With passphrase:    make certs-passphrase
    #   2. Uncomment the volumes mount above
    #   3. Set SSL environment variables
    #   4. If using a passphrase-protected key, set KEY_FILE_PASSWORD in the .env file
    #
    # For JWT asymmetric keys:
    #   1. Generate keys: make certs-jwt
    #   2. Uncomment the volumes mount above
    #   3. Switch JWT_ALGORITHM to RS256 and uncomment the JWT_*_KEY_PATH variables
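    # Illustrative (Compose v2): deploy.replicas above starts 3 gateway replicas;
    # with the ports mapping commented out they can also be scaled ad hoc:
    #   docker compose up -d --scale gateway=5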
###############################################################################
# DATABASES - enable ONE of these blocks and adjust DATABASE_URL
###############################################################################
  postgres:
    image: postgres:18
    shm_size: 256m # Increase from 64MB default to prevent shared memory exhaustion under load
    ulimits:
      nofile:
        soft: 8192
        hard: 8192
    ports:
      - "5433:5432" # Expose for baseline load testing (5433 to avoid conflict with local postgres)
    # Performance tuning for high-load testing (3000 sustained users)
    # WITH PgBouncer (default): 800 connections provides headroom for 700 pool + system overhead
    # DIRECT connection mode: increase to 4000 for (3 replicas × 16 workers × 80 pool)
    command:
      - "postgres"
      - "-c"
      - "max_connections=800" # Must exceed PgBouncer MAX_DB_CONNECTIONS (700) + overhead
      - "-c"
      - "shared_buffers=512MB"
      - "-c"
      - "work_mem=16MB"
      - "-c"
      - "effective_cache_size=1536MB"
      - "-c"
      - "maintenance_work_mem=128MB"
      - "-c"
      - "checkpoint_completion_target=0.9"
      - "-c"
      - "wal_buffers=16MB"
      - "-c"
      - "random_page_cost=1.1"
      - "-c"
      - "effective_io_concurrency=200"
      - "-c"
      - "max_worker_processes=8" # Total background workers (must be >= max_parallel_workers)
      - "-c"
      - "max_parallel_workers_per_gather=4" # Max workers per query's parallel operation
      - "-c"
      - "max_parallel_workers=8" # Total parallel workers available system-wide
      # === HIGH-CONCURRENCY TUNING (3000 users) ===
      # CRITICAL: idle_in_transaction_session_timeout prevents connection starvation
      # Application code now properly closes transactions via the get_db() commit-on-success pattern
      # This timeout is a safety net for any edge cases
      - "-c"
      - "idle_in_transaction_session_timeout=300s" # Kill stuck transactions after 300s (backstop behind PgBouncer's IDLE_TRANSACTION_TIMEOUT)
      - "-c"
      - "statement_timeout=120s" # Kill runaway queries after 120s
      - "-c"
      - "synchronous_commit=off" # Async WAL writes (2-10x faster commits)
      - "-c"
      - "commit_delay=100" # Batch commits within 100μs window
      # ═══════════════════════════════════════════════════════════════════════════
      # AUTOVACUUM TUNING - High-insert workloads (metrics tables)
      # ═══════════════════════════════════════════════════════════════════════════
      # High insert rates cause dead tuple accumulation. These settings help
      # PostgreSQL keep up with table bloat from metrics writes.
      # Uncomment if experiencing performance degradation under sustained load:
      # - "-c"
      # - "autovacuum_naptime=30s" # Check more frequently (default: 60s)
      # - "-c"
      # - "autovacuum_vacuum_scale_factor=0.05" # Vacuum at 5% dead tuples (default: 0.2)
      # - "-c"
      # - "autovacuum_vacuum_cost_limit=1000" # More vacuum work per cycle (default: 200)
      # === PG_STAT_STATEMENTS + AUTO_EXPLAIN ===
      # Query performance tracking and slow query plan logging
      # NOTE: Both extensions must be in the SAME shared_preload_libraries line!
      # After enabling, run in psql:
      #   CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
      #   SELECT * FROM pg_stat_statements ORDER BY total_exec_time DESC LIMIT 10;
      # - "-c"
      # - "shared_preload_libraries=pg_stat_statements"
      # - "shared_preload_libraries=pg_stat_statements,auto_explain" # Use this line to enable both
      # - "-c"
      # - "pg_stat_statements.track=all"
      # - "-c"
      # - "pg_stat_statements.max=10000"
      # AUTO_EXPLAIN settings (uncomment if using the combined shared_preload_libraries above)
      # - "-c"
      # - "auto_explain.log_min_duration=1000"
      # - "-c"
      # - "auto_explain.log_analyze=on"
      # === ROLLBACK DEBUGGING (disabled for performance) ===
      # - "-c"
      # - "log_min_error_statement=error"
      # - "-c"
      # - "log_min_messages=warning"
      # - "-c"
      # - "log_error_verbosity=verbose"
      # - "-c"
      # - "log_line_prefix=%t [%p]: user=%u,db=%d,app=%a,client=%h "
      # - "-c"
      # - "log_lock_waits=on"
      # - "-c"
      # - "deadlock_timeout=1s"
      # - "-c"
      # - "log_temp_files=0"
      # - "-c"
      # - "log_checkpoints=on"
      # - "-c"
      # - "log_connections=on"
      # - "-c"
      # - "log_disconnections=on"
      # - "-c"
      # - "idle_in_transaction_session_timeout=60s"
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=mysecretpassword
      - POSTGRES_DB=mcp
    volumes:
      # - pgdata:/var/lib/postgresql/data
      - pgdata:/var/lib/postgresql # Enable for postgres 18+
    networks: [mcpnet]
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER"]
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 20s
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          cpus: '2'
          memory: 2G

  # ──────────────────────────────────────────────────────────────────────
  # PgBouncer - Connection Pooler for PostgreSQL
  # Reduces connection overhead, improves throughput under high concurrency.
  # Enable by switching the gateway DATABASE_URL to pgbouncer:6432 instead of postgres:5432
  # ──────────────────────────────────────────────────────────────────────
  pgbouncer:
    image: edoburu/pgbouncer:latest
    restart: unless-stopped
    networks: [mcpnet]
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
    ports:
      - "6432:6432" # PgBouncer port (optional external access)
    environment:
      # Connection to upstream PostgreSQL
      - DATABASE_URL=postgres://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
      # PgBouncer listen port (default would be 5432, using 6432 to distinguish from PostgreSQL)
      - LISTEN_PORT=6432
      # Pool mode: transaction (recommended), session, or statement
      # transaction: connection returned after each transaction (best for web apps)
      - POOL_MODE=transaction
      # ═══════════════════════════════════════════════════════════════════════════
      # Connection Pool Tuning for 3000 Sustained Users
      # PgBouncer handles connection multiplexing - many app connections share fewer DB connections
      # ═══════════════════════════════════════════════════════════════════════════
      # Client-side limits (from gateway workers via SQLAlchemy)
      - MAX_CLIENT_CONN=5000 # Max app connections; must exceed (replicas × workers × pool)
      - DEFAULT_POOL_SIZE=600 # Shared DB connections; sized for ~70 concurrent tx × 8x headroom
      - MIN_POOL_SIZE=100 # Pre-warmed connections for instant response to load spikes
      - RESERVE_POOL_SIZE=150 # Emergency pool for burst traffic beyond DEFAULT_POOL_SIZE
      - RESERVE_POOL_TIMEOUT=2 # Seconds before tapping reserve pool
      # Server-side limits (to PostgreSQL)
      - MAX_DB_CONNECTIONS=700 # Max connections to PostgreSQL; must be < PG max_connections
      - MAX_USER_CONNECTIONS=700 # Per-user limit; typically equals MAX_DB_CONNECTIONS
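      # Illustrative: live pool usage can be inspected through PgBouncer's admin
      # database (port 6432 is published above; the connection string mirrors the
      # pgbouncer_exporter below; assumes psql is installed on the host):
      #   psql "postgres://postgres:mysecretpassword@localhost:6432/pgbouncer" -c "SHOW POOLS;"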
      # Connection lifecycle
      - SERVER_LIFETIME=3600 # Recycle server connections after 1 hour (prevents stale state)
      - SERVER_IDLE_TIMEOUT=600 # Close unused server connections after 10 min
      # Timeout settings
      - QUERY_WAIT_TIMEOUT=60 # Max wait for an available connection before failing the request
      - CLIENT_IDLE_TIMEOUT=60 # Close client connections idle for more than 60s
      - SERVER_CONNECT_TIMEOUT=5 # Timeout for new connections to PostgreSQL
      # Transaction cleanup - critical for avoiding idle-in-transaction buildup
      # NOTE: In transaction pooling, session-level advisory locks (used by migrations)
      # can stick unless the reset query clears them; DISCARD ALL is safest.
      - SERVER_RESET_QUERY=DISCARD ALL # Reset connection state when returned to pool
      - SERVER_RESET_QUERY_ALWAYS=1 # Always run reset query even after clean transactions
      - IDLE_TRANSACTION_TIMEOUT=30 # Kill transactions idle > 30s to prevent connection pool exhaustion
      # Authentication
      - AUTH_TYPE=scram-sha-256 # Match PostgreSQL auth method
    depends_on:
      postgres:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "pg_isready", "-h", "localhost", "-p", "6432"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 256M
        reservations:
          cpus: '0.5'
          memory: 128M

  # migration:
  #   #image: ghcr.io/ibm/mcp-context-forge:0.7.0 # Testing migration from 0.7.0
  #   image: mcpgateway/mcpgateway:latest # Use the local latest image. Run `make docker-prod` to build it.
  #   build:
  #     context: .
  #     dockerfile: Containerfile
  #   environment:
  #     - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
  #   command: alembic -c mcpgateway/alembic.ini upgrade head
  #   depends_on:
  #     postgres:
  #       condition: service_healthy
  #   networks: [mcpnet]

###############################################################################
# CACHE
###############################################################################
  redis:
    image: redis:latest
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
    # Performance tuning for 1000+ RPS high-concurrency load testing
    command:
      - "redis-server"
      - "--maxmemory"
      - "1gb"
      - "--maxmemory-policy"
      - "allkeys-lru"
      - "--tcp-backlog"
      - "2048"
      - "--timeout"
      - "0"
      - "--tcp-keepalive"
      - "300"
      - "--maxclients"
      - "10000"
    ports:
      - "6379:6379" # expose only if you want host access
    networks: [mcpnet]
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '1'
          memory: 1G

###############################################################################
# MONITORING STACK (enabled with --profile monitoring)
# Usage:  docker compose --profile monitoring up -d
# Access: Grafana    http://localhost:3000 (admin/changeme)
#         Prometheus http://localhost:9090
###############################################################################

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus PostgreSQL Exporter - Database metrics
  # Metrics: connections, query duration, locks, cache hit ratio
  # ──────────────────────────────────────────────────────────────────────
  postgres_exporter:
    image: quay.io/prometheuscommunity/postgres-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9187:9187" # http://localhost:9187/metrics
    environment:
      - DATA_SOURCE_NAME=postgresql://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp?sslmode=disable
      - PG_EXPORTER_AUTO_DISCOVER_DATABASES=true
    depends_on:
      postgres:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus Redis Exporter - Cache metrics
  # Metrics: memory, clients, commands/sec, keyspace stats
  # ──────────────────────────────────────────────────────────────────────
  redis_exporter:
    image: oliver006/redis_exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9121:9121" # http://localhost:9121/metrics
    environment:
      - REDIS_ADDR=redis://redis:6379
    depends_on:
      redis:
        condition: service_started
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus PgBouncer Exporter - Connection pool metrics
  # Metrics: active/waiting clients, server connections, pool stats
  # ──────────────────────────────────────────────────────────────────────
  pgbouncer_exporter:
    image: prometheuscommunity/pgbouncer-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9127:9127" # http://localhost:9127/metrics
    environment:
      - PGBOUNCER_EXPORTER_CONNECTION_STRING=postgres://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@pgbouncer:6432/pgbouncer?sslmode=disable
    depends_on:
      pgbouncer:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus Nginx Exporter - Proxy metrics
  # Metrics: active connections, requests/sec, response codes
  # Requires stub_status enabled in nginx.conf (location /nginx_status)
  # ──────────────────────────────────────────────────────────────────────
  nginx_exporter:
    image: nginx/nginx-prometheus-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9113:9113" # http://localhost:9113/metrics
    command:
      - '-nginx.scrape-uri=http://nginx:80/nginx_status'
    depends_on:
      nginx:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # cAdvisor - Container metrics (CPU, memory, network, disk I/O)
  # Metrics: container_cpu_usage_seconds_total, container_memory_usage_bytes
  # Dashboard: Grafana ID 14282 (Docker and cAdvisor)
  # ──────────────────────────────────────────────────────────────────────
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8085:8080" # http://localhost:8085/metrics
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus - Metrics collection and storage
  # Scrapes: gateway, postgres, redis, nginx, cadvisor
  # Retention: 7 days (configurable via --storage.tsdb.retention.time)
  # ──────────────────────────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9090:9090" # http://localhost:9090
    volumes:
      - ./infra/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheusdata:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=7d'
      - '--web.enable-lifecycle'
    depends_on:
      - postgres_exporter
      - redis_exporter
      - nginx_exporter
      - cadvisor
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Loki - Log aggregation system (like Prometheus, but for logs)
  # Query logs with LogQL in Grafana
  # ──────────────────────────────────────────────────────────────────────
  loki:
    image: grafana/loki:latest
    restart: unless-stopped
    networks: [mcpnet]
    user: "0" # Run as root to avoid permission issues
    ports:
      - "3100:3100" # http://localhost:3100/ready
    volumes:
      - ./infra/monitoring/loki/loki-config.yaml:/etc/loki/local-config.yaml:ro
      - lokidata:/loki
    command: -config.file=/etc/loki/local-config.yaml
    profiles: ["monitoring"]
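  # Illustrative liveness checks once the monitoring profile is up (ports come from
  # the mappings in this file; Prometheus's /-/healthy and Loki's /ready are the
  # standard readiness endpoints):
  #   curl -s http://localhost:9090/-/healthy   # Prometheus
  #   curl -s http://localhost:3100/ready       # Loki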
  # ──────────────────────────────────────────────────────────────────────
  # Promtail - Log collector for Loki
  # Collects logs from all containers via the Docker socket
  # ──────────────────────────────────────────────────────────────────────
  promtail:
    image: grafana/promtail:latest
    restart: unless-stopped
    networks: [mcpnet]
    volumes:
      - ./infra/monitoring/loki/promtail-config.yaml:/etc/promtail/config.yaml:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    command: -config.file=/etc/promtail/config.yaml
    depends_on:
      - loki
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Grafana - Dashboard visualization
  # Default login: admin / changeme
  # Recommended dashboards:
  #   - Docker/cAdvisor: 14282
  #   - PostgreSQL:      9628
  #   - Redis:           763
  #   - Nginx:           12708
  # ──────────────────────────────────────────────────────────────────────
  grafana:
    image: grafana/grafana:latest
    restart: unless-stopped
    networks: [mcpnet]
    user: "0" # Run as root to avoid permission issues with provisioning
    ports:
      - "3000:3000" # http://localhost:3000
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=changeme
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafanadata:/var/lib/grafana
      - ./infra/monitoring/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
      - ./infra/monitoring/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
    depends_on:
      - prometheus
    profiles: ["monitoring"]

###############################################################################
# OPTIONAL ADMIN TOOLS - handy web UIs for DB & cache (disabled by default)
###############################################################################
  pgadmin: # 🔧 Postgres admin UI
    image: dpage/pgadmin4:9.11.0
    environment:
      - PGADMIN_DEFAULT_EMAIL=admin@example.com
      - PGADMIN_DEFAULT_PASSWORD=changeme
    ports:
      - "5050:80" # http://localhost:5050
    volumes:
      - pgadmindata:/var/lib/pgadmin
    networks: [mcpnet]
    depends_on:
      postgres:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Redis Commander - a web-based Redis GUI
  # ──────────────────────────────────────────────────────────────────────
  redis_commander: # 🔧 Redis key browser
    image: rediscommander/redis-commander:latest
    restart: unless-stopped
    networks: [mcpnet]
    depends_on:
      redis:
        condition: service_started
    ports:
      - "8081:8081" # http://localhost:8081
    environment:
      - REDIS_HOSTS=local:redis:6379
      - HTTP_USER=admin
      - HTTP_PASSWORD=changeme
    profiles: ["monitoring"]

  # # ──────────────────────────────────────────────────────────────────────
  # # Redis Insight - a powerful Redis GUI (recently updated)
  # # ──────────────────────────────────────────────────────────────────────
  # redis_insight: # 🔧 Redis Insight GUI
  #   image: redis/redisinsight:latest
  #   container_name: redisinsight
  #   restart: unless-stopped
  #   networks: [mcpnet]
  #   ports:
  #     - "5540:5540" # Redis Insight UI (default 5540)
  #   depends_on: # Default stack: Postgres + Redis
  #     redis:
  #       condition: service_started
  #   # ──────────────────────────────────────────────────────────────────────
  #   # Persist data (config, logs, history) between restarts
  #   # ──────────────────────────────────────────────────────────────────────
  #   # volumes:
  #   #   - ./redisinsight_data:/data
  #   volumes:
  #     - redisinsight_data:/data # <- persist data in named volume
  #   # ──────────────────────────────────────────────────────────────────────
  #   # Preconfigure Redis connection(s) via env vars
  #   # ──────────────────────────────────────────────────────────────────────
  #   environment:
  #     # Single connection (omit "*" since only one):
  #     - RI_REDIS_HOST=redis # <- your Redis hostname
  #     - RI_REDIS_PORT=6379 # <- your Redis port
  #     - RI_REDIS_USERNAME=default # <- ACL/username (Redis 6+)
  #     #- RI_REDIS_PASSWORD=changeme # <- Redis AUTH password
  #     #- RI_REDIS_TLS=true # <- enable TLS
  #     # Optional: validate self-signed CA instead of trusting all:
  #     # - RI_REDIS_TLS_CA_PATH=/certs/selfsigned.crt
  #     # - RI_REDIS_TLS_CERT_PATH=/certs/client.crt
  #     # - RI_REDIS_TLS_KEY_PATH=/certs/client.key
  #     # - RI_REDIS_TLS=true # (already set above)
  #     # ──────────────────────────────────────────────────────────────
  #     # Core Redis Insight settings
  #     # ──────────────────────────────────────────────────────────────
  #     - RI_APP_HOST=0.0.0.0 # <- listen on all interfaces
  #     - RI_APP_PORT=5540 # <- UI port (container-side)

###############################################################################
# OPTIONAL MCP SERVERS - drop-in helpers the Gateway can call
###############################################################################

###############################################################################
# Fast Time Server - High-performance time/timezone service for MCP
# Uses the pre-built image by default. On ARM64, build locally:
#   FAST_TIME_IMAGE=mcpgateway/fast-time-server:local docker compose build fast_time_server
###############################################################################
  fast_time_server:
    image: ${FAST_TIME_IMAGE:-ghcr.io/ibm/fast-time-server:latest}
    build:
      context: ./mcp-servers/go/fast-time-server
      dockerfile: Dockerfile
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8888:8080" # Map host port 8888 to container port 8080
    # Use dual mode for both SSE (/sse) and Streamable HTTP (/http) endpoints
    command: ["-transport=dual", "-listen=0.0.0.0", "-port=8080", "-log-level=info"]

###############################################################################
# Auto-registration service - registers fast_time_server with gateway
###############################################################################
  register_fast_time:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest}
    networks: [mcpnet]
    depends_on:
      gateway:
        condition: service_healthy
      fast_time_server:
        condition: service_started
    environment:
      - JWT_SECRET_KEY=my-test-key
    # This is a one-shot container that exits after registration
    restart: "no"
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "Using latest gateway image with current JWT utility..."
        echo "Waiting for services to be ready..."

        # Wait for gateway to be ready using Python
        python3 -c "
        import time
        import urllib.request
        import urllib.error

        for i in range(1, 61):
            try:
                with urllib.request.urlopen('http://gateway:4444/health', timeout=2) as response:
                    if response.status == 200:
                        print('✅ gateway is healthy')
                        break
            except:
                pass
            print(f'Waiting for gateway... ({i}/60)')
            time.sleep(2)
        else:
            print('❌ Gateway failed to become healthy')
            exit(1)
        "

        # Wait for fast_time_server to be ready using Python
        python3 -c "
        import time
        import urllib.request
        import urllib.error

        for i in range(1, 31):
            try:
                with urllib.request.urlopen('http://fast_time_server:8080/health', timeout=2) as response:
                    if response.status == 200:
                        print('✅ fast_time_server is healthy')
                        break
            except:
                pass
            print(f'Waiting for fast_time_server... ({i}/30)')
            time.sleep(2)
        else:
            print('❌ Fast time server failed to become healthy')
            exit(1)
        "

        echo "Generating JWT token..."
        echo "Environment: JWT_SECRET_KEY=$$JWT_SECRET_KEY"
        echo "Running: python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256"

        # Only capture stdout (the token), let warnings go to stderr
        export MCPGATEWAY_BEARER_TOKEN=$$(python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256 2>/dev/null)
        echo "Generated token: $$MCPGATEWAY_BEARER_TOKEN"

        # Decode the token to verify it has expiration
        echo "Decoding token to verify claims..."
        python3 -m mcpgateway.utils.create_jwt_token --decode "$$MCPGATEWAY_BEARER_TOKEN" 2>/dev/null || echo "Failed to decode token"

        # Test authentication first
        echo "Testing authentication..."

        # Use Python to make HTTP requests
        python3 -c "
        import urllib.request
        import urllib.error
        import json
        import sys
        import os
        import time

        token = os.environ.get('MCPGATEWAY_BEARER_TOKEN', '')

        def api_request(method, path, data=None):
            '''Helper to make authenticated API requests.'''
            url = f'http://gateway:4444{path}'
            req = urllib.request.Request(url, method=method)
            req.add_header('Authorization', f'Bearer {token}')
            req.add_header('Content-Type', 'application/json')
            if data:
                req.data = json.dumps(data).encode('utf-8')
            with urllib.request.urlopen(req) as response:
                return json.loads(response.read().decode('utf-8'))

        # Test version endpoint without auth
        print('Checking gateway config...')
        try:
            with urllib.request.urlopen('http://gateway:4444/version') as response:
                data = response.read().decode('utf-8')
                print(f'Gateway version response (no auth): {data[:200]}')
        except Exception as e:
            print(f'Version check failed: {e}')

        # Test version endpoint with auth
        print('Testing authentication...')
        try:
            req = urllib.request.Request('http://gateway:4444/version')
            req.add_header('Authorization', f'Bearer {token}')
            with urllib.request.urlopen(req) as response:
                data = response.read().decode('utf-8')
                print(f'Auth test response: SUCCESS')
                auth_success = True
        except Exception as e:
            print(f'Auth test response: FAILED - {e}')
            auth_success = False

        # Register fast_time_server with gateway using Streamable HTTP transport
        print('Registering fast_time_server with gateway (Streamable HTTP)...')

        # First check if gateway already exists and delete it
        gateway_id = None
        try:
            gateways = api_request('GET', '/gateways')
            for gw in gateways:
                if gw.get('name') == 'fast_time':
                    print(f'Found existing gateway {gw[\"id\"]}, deleting...')
                    api_request('DELETE', f'/gateways/{gw[\"id\"]}')
                    print('Deleted existing gateway')
        except Exception as e:
            print(f'Note: Could not check/delete existing gateway: {e}')

        # Delete existing virtual server if present (using fixed ID)
        VIRTUAL_SERVER_ID = '9779b6698cbd4b4995ee04a4fab38737'
        try:
            api_request('DELETE', f'/servers/{VIRTUAL_SERVER_ID}')
            print(f'Deleted existing virtual server {VIRTUAL_SERVER_ID}')
        except Exception as e:
            print(f'Note: No existing virtual server to delete (or error: {e})')

        # Register the gateway
        try:
            result = api_request('POST', '/gateways', {
                'name': 'fast_time',
                'url': 'http://fast_time_server:8080/http',
                'transport': 'STREAMABLEHTTP'
            })
            print(f'Registration response: {result}')
            if 'id' in result:
                gateway_id = result['id']
                print(f'✅ Successfully registered fast_time_server (gateway_id: {gateway_id})')
            else:
                print('❌ Registration failed - no ID in response')
                sys.exit(1)
        except Exception as e:
            print(f'❌ Registration failed: {e}')
            sys.exit(1)

        # Wait for tools to be synced from the gateway
        print('Waiting for tools/resources/prompts to sync...')
        for i in range(30):
            time.sleep(1)
            try:
                tools = api_request('GET', '/tools')
                # Filter tools from fast_time gateway (note: camelCase gatewayId)
                fast_time_tools = [t for t in tools if t.get('gatewayId') == gateway_id]
                if fast_time_tools:
                    print(f'Found {len(fast_time_tools)} tools from fast_time gateway')
                    break
            except Exception as e:
                pass
            print(f'Waiting for sync... ({i+1}/30)')
        else:
            print('⚠️ No tools synced, continuing anyway...')

        # Fetch all tools, resources, and prompts
        # Note: Tools use gatewayId (camelCase), resources/prompts from catalog have no gatewayId
        tool_ids = []
        resource_ids = []
        prompt_ids = []

        try:
            tools = api_request('GET', '/tools')
            # Get tools from the fast_time gateway
            tool_ids = [t['id'] for t in tools if t.get('gatewayId') == gateway_id]
            print(f'Found tools: {[t[\"name\"] for t in tools if t.get(\"gatewayId\") == gateway_id]}')
        except Exception as e:
            print(f'Failed to fetch tools: {e}')

        try:
            resources = api_request('GET', '/resources')
            # Include all resources (from catalog)
            resource_ids = [r['id'] for r in resources]
            print(f'Found resources: {[r[\"name\"] for r in resources]}')
        except Exception as e:
            print(f'Failed to fetch resources: {e}')

        try:
            prompts = api_request('GET', '/prompts')
            # Include all prompts (from catalog)
            prompt_ids = [p['id'] for p in prompts]
            print(f'Found prompts: {[p[\"name\"] for p in prompts]}')
        except Exception as e:
            print(f'Failed to fetch prompts: {e}')

        # Create virtual server with all tools, resources, and prompts
        print('Creating virtual server...')
        try:
            # API expects payload wrapped in 'server' key
            # Use fixed UUID for consistent server ID across restarts
            server_payload = {
                'server': {
                    'id': '9779b6698cbd4b4995ee04a4fab38737',
                    'name': 'Fast Time Server',
                    'description': 'Virtual server exposing Fast Time MCP tools, resources, and prompts',
                    'associated_tools': tool_ids,
                    'associated_resources': resource_ids,
                    'associated_prompts': prompt_ids
                }
            }
            result = api_request('POST', '/servers', server_payload)
            print(f'Virtual server created: {result}')
            print(f'✅ Successfully created virtual server with {len(tool_ids)} tools, {len(resource_ids)} resources, {len(prompt_ids)} prompts')
        except Exception as e:
            print(f'❌ Failed to create virtual server: {e}')
            sys.exit(1)
        "

        # Write the bearer token to a file for load testing
        echo "Writing bearer token to /tmp/gateway-token.txt..."
        echo "$$MCPGATEWAY_BEARER_TOKEN" > /tmp/gateway-token.txt
        echo "Token written to /tmp/gateway-token.txt"

        echo "✅ Setup complete!"
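  # Illustrative follow-up: the one-shot job above writes the bearer token to
  # /tmp/gateway-token.txt inside its (exited) container. A token can also be
  # re-minted on the host (see the JWT example near the gateway settings above)
  # and used against the same registry endpoints the script calls, e.g.:
  #   curl -s -H "Authorization: Bearer $TOKEN" http://localhost:8080/tools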
###############################################################################
# Fast Test Server - Ultra-fast Rust MCP server for performance testing
# Provides: echo, get_system_time, get_stats tools via MCP Streamable HTTP
# Also exposes REST API endpoints for baseline comparison
# Usage: docker compose --profile testing up -d
###############################################################################
  fast_test_server:
    build:
      context: ./mcp-servers/rust/fast-test-server
      dockerfile: Containerfile
    image: mcpgateway/fast-test-server:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8880:8880" # Port 8880 (avoids conflict with benchmark servers on 9000+)
    environment:
      - BIND_ADDRESS=0.0.0.0:8880
      - RUST_LOG=info
    # TCP kernel tuning for high-concurrency load testing
    sysctls:
      - net.ipv4.tcp_fin_timeout=15 # Faster cleanup of FIN_WAIT2 sockets
      - net.ipv4.ip_local_port_range=1024 65535 # More ephemeral ports
      - net.core.somaxconn=65535 # Max listen backlog
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8880/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 128M
    profiles: ["testing", "monitoring"]

###############################################################################
# Auto-registration service - registers fast_test_server with gateway
###############################################################################
  register_fast_test:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest}
    networks: [mcpnet]
    depends_on:
      gateway:
        condition: service_healthy
      fast_test_server:
        condition: service_healthy
    environment:
      - JWT_SECRET_KEY=my-test-key
    restart: "no"
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "Registering fast_test_server with gateway..."

        # Generate JWT token
        export MCPGATEWAY_BEARER_TOKEN=$$(python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256 2>/dev/null)

        # Register using Python
        python3 -c "
        import urllib.request
        import json
        import os
        import time

        token = os.environ.get('MCPGATEWAY_BEARER_TOKEN', '')

        def api_request(method, path, data=None):
            url = f'http://gateway:4444{path}'
            req = urllib.request.Request(url, method=method)
            req.add_header('Authorization', f'Bearer {token}')
            req.add_header('Content-Type', 'application/json')
            if data:
                req.data = json.dumps(data).encode('utf-8')
            with urllib.request.urlopen(req) as response:
                return json.loads(response.read().decode('utf-8'))

        # Delete existing gateway if present
        try:
            gateways = api_request('GET', '/gateways')
            for gw in gateways:
                if gw.get('name') == 'fast_test':
                    print(f'Deleting existing gateway {gw[\"id\"]}...')
                    api_request('DELETE', f'/gateways/{gw[\"id\"]}')
        except Exception as e:
            print(f'Note: {e}')

        # Register the gateway
        try:
            result = api_request('POST', '/gateways', {
                'name': 'fast_test',
                'url': 'http://fast_test_server:8880/mcp',
                'transport': 'STREAMABLEHTTP'
            })
            print(f'✅ Registered fast_test_server: {result.get(\"id\", \"unknown\")}')
        except Exception as e:
            print(f'❌ Registration failed: {e}')
            exit(1)
        "

        echo "✅ Registration complete!"
    profiles: ["testing", "monitoring"]
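  # Illustrative: the Rust test server exposes /health (used by its healthcheck above)
  # on the published port, handy as a REST baseline before going through the gateway:
  #   curl -sf http://localhost:8880/health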
profiles: ["testing", "monitoring"] ############################################################################### # Benchmark Server - Multi-server MCP benchmark tool # Spawns multiple lightweight MCP servers for load testing # Usage: make benchmark-up (or: docker compose --profile benchmark up -d) # # Environment variables: # BENCHMARK_SERVER_COUNT - Number of servers to spawn (default: 10) # BENCHMARK_START_PORT - Starting port number (default: 9000) ############################################################################### benchmark_server: build: context: ./mcp-servers/go/benchmark-server dockerfile: Dockerfile image: mcpgateway/benchmark-server:latest restart: unless-stopped networks: [mcpnet] command: - "-transport=http" - "-server-count=${BENCHMARK_SERVER_COUNT:-10}" - "-start-port=${BENCHMARK_START_PORT:-9000}" - "-tools=50" - "-resources=20" - "-prompts=10" ports: # Port range supports up to 100 servers (9000-9099) # Actual servers spawned controlled by BENCHMARK_SERVER_COUNT - "9000-9099:9000-9099" # Note: No healthcheck - scratch-based Go image has no shell # Verify health via: curl http://localhost:9000/health deploy: resources: limits: cpus: '2' memory: 1G reservations: cpus: '0.5' memory: 256M profiles: ["benchmark"] ############################################################################### # Auto-registration service - registers benchmark servers with gateway # Uses BENCHMARK_SERVER_COUNT and BENCHMARK_START_PORT environment variables ############################################################################### register_benchmark: image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest} networks: [mcpnet] depends_on: gateway: condition: service_healthy benchmark_server: condition: service_started environment: - JWT_SECRET_KEY=my-test-key - BENCHMARK_SERVER_COUNT=${BENCHMARK_SERVER_COUNT:-10} - BENCHMARK_START_PORT=${BENCHMARK_START_PORT:-9000} restart: "no" entrypoint: ["/bin/sh", "-c"] command: - | echo "Registering benchmark servers with gateway..." # Wait for benchmark servers to start (no healthcheck available) echo "Waiting for benchmark servers to start..." 
sleep 5 # Generate JWT token export MCPGATEWAY_BEARER_TOKEN=$$(python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256 2>/dev/null) # Register benchmark servers using environment variables python3 -c " import urllib.request import json import os token = os.environ.get('MCPGATEWAY_BEARER_TOKEN', '') server_count = int(os.environ.get('BENCHMARK_SERVER_COUNT', '10')) start_port = int(os.environ.get('BENCHMARK_START_PORT', '9000')) headers = { 'Authorization': f'Bearer {token}', 'Content-Type': 'application/json' } def api_request(method, path, data=None): url = f'http://gateway:4444{path}' body = json.dumps(data).encode() if data else None req = urllib.request.Request(url, data=body, headers=headers, method=method) with urllib.request.urlopen(req, timeout=30) as resp: return json.loads(resp.read().decode()) # Register benchmark servers print(f'Registering {server_count} benchmark servers (ports {start_port}-{start_port + server_count - 1})...') registered = 0 for port in range(start_port, start_port + server_count): name = f'benchmark-{port}' try: result = api_request('POST', '/gateways', { 'name': name, 'url': f'http://benchmark_server:{port}/mcp', 'transport': 'STREAMABLEHTTP' }) print(f'✅ Registered {name}: {result.get(\"id\", \"unknown\")}') registered += 1 except urllib.error.HTTPError as e: if e.code == 409: print(f'⏭️ {name} already registered') registered += 1 else: print(f'❌ Failed to register {name}: HTTP {e.code}') except Exception as e: print(f'❌ Failed to register {name}: {e}') print(f'✅ Registration complete: {registered}/{server_count} benchmark servers') " profiles: ["benchmark"] ############################################################################### # TLS PROFILE - Zero-config HTTPS via Nginx (enabled with --profile tls) # Usage: make compose-tls (or: docker compose --profile tls up -d) # # Features: # - Auto-generates self-signed certificates on first run # - Supports custom certificates (CA-signed or your own) # - Supports passphrase-protected keys (auto-decrypted for nginx) # - HTTPS on port 8443, HTTP on port 8080 (both available) # - Compatible with other profiles: --profile tls --profile monitoring # # ═══════════════════════════════════════════════════════════════════════════ # Bringing Your Own Certificates # ═══════════════════════════════════════════════════════════════════════════ # # Option 1: Unencrypted Private Key (no passphrase) # ─────────────────────────────────────────────────────────────────────────── # mkdir -p certs # cp /path/to/your/certificate.pem certs/cert.pem # cp /path/to/your/private-key.pem certs/key.pem # make compose-tls # # Option 2: Passphrase-Protected Private Key # ─────────────────────────────────────────────────────────────────────────── # mkdir -p certs # cp /path/to/your/certificate.pem certs/cert.pem # cp /path/to/your/encrypted-key.pem certs/key-encrypted.pem # echo "KEY_FILE_PASSWORD=your-passphrase" >> .env # make compose-tls # # The cert_init service will automatically decrypt key-encrypted.pem to # key.pem for nginx (nginx doesn't support passphrase-protected keys). 
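#
# Sanity check (illustrative): after decryption, key.pem should match cert.pem.
# For the RSA keys used by this stack the two digests below must be identical:
#   openssl x509 -noout -modulus -in certs/cert.pem | openssl md5
#   openssl rsa  -noout -modulus -in certs/key.pem  | openssl md5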
#
# Option 3: Generate Self-Signed with Passphrase
# ───────────────────────────────────────────────────────────────────────────
#   make certs-passphrase                            # Generates cert + key-encrypted.pem
#   echo "KEY_FILE_PASSWORD=your-passphrase" >> .env
#   make compose-tls                                 # Auto-decrypts for nginx
###############################################################################

  # ──────────────────────────────────────────────────────────────────────
  # Certificate Initialization - Auto-generates self-signed certs if missing
  # Supports passphrase-protected keys via KEY_FILE_PASSWORD
  # ──────────────────────────────────────────────────────────────────────
  cert_init:
    image: alpine/openssl:latest
    volumes:
      - ./certs:/certs
    environment:
      - KEY_FILE_PASSWORD=${KEY_FILE_PASSWORD:-}
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        # Check if we have an encrypted key that needs decryption
        # ($$ defers expansion to the container shell, so the passphrase is read
        #  from the environment above instead of being baked into the rendered command)
        if [ -f /certs/key-encrypted.pem ] && [ -n "$${KEY_FILE_PASSWORD}" ]; then
          # Validate: encrypted key requires matching certificate
          if [ ! -f /certs/cert.pem ]; then
            echo "❌ Found key-encrypted.pem but cert.pem is missing"
            echo "   Please provide both files: cert.pem and key-encrypted.pem"
            exit 1
          fi

          echo "🔓 Decrypting passphrase-protected key for nginx..."
          if [ -f /certs/key.pem ]; then
            echo "⚠️ Overwriting existing key.pem with decrypted version"
          fi

          # Decrypt the key for nginx (nginx doesn't support passphrase-protected keys)
          # Using env: prefix to avoid exposing password in process listing
          openssl rsa -in /certs/key-encrypted.pem -out /certs/key.pem -passin env:KEY_FILE_PASSWORD
          if [ $? -eq 0 ]; then
            chmod 640 /certs/key.pem
            echo "✅ Successfully decrypted key-encrypted.pem to key.pem"
          else
            echo "❌ Failed to decrypt key-encrypted.pem - check KEY_FILE_PASSWORD"
            exit 1
          fi
        fi

        # Check if we already have unencrypted certs
        if [ -f /certs/cert.pem ] && [ -f /certs/key.pem ]; then
          echo "✅ Certificates found in ./certs - using existing"
          exit 0
        fi

        # Generate new self-signed certificate (without passphrase for nginx)
        echo "🔏 Generating self-signed TLS certificate..."
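          # (Illustrative) to cover additional hostnames, append to the SAN list
          # below, e.g. ",DNS:my-gateway.example.com"; -addext requires
          # OpenSSL 1.1.1+, which the alpine/openssl image ships.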
mkdir -p /certs openssl req -x509 -newkey rsa:4096 -sha256 -days 365 -nodes \ -keyout /certs/key.pem -out /certs/cert.pem \ -subj "/CN=localhost" \ -addext "subjectAltName=DNS:localhost,DNS:gateway,DNS:nginx,IP:127.0.0.1" chmod 644 /certs/cert.pem chmod 640 /certs/key.pem echo "✅ TLS certificate generated in ./certs" profiles: ["tls"] # ────────────────────────────────────────────────────────────────────── # Nginx TLS - HTTPS-enabled reverse proxy (overrides default nginx) # ────────────────────────────────────────────────────────────────────── nginx_tls: build: context: ./infra/nginx dockerfile: Dockerfile image: mcpgateway/nginx-cache:latest restart: unless-stopped ports: - "8080:80" # HTTP caching proxy (public-facing) - "8443:443" # HTTPS caching proxy (public-facing) networks: [mcpnet] environment: # Set to "true" to force all HTTP requests to redirect to HTTPS - NGINX_FORCE_HTTPS=${NGINX_FORCE_HTTPS:-false} depends_on: gateway: condition: service_healthy cert_init: condition: service_completed_successfully volumes: - nginx_cache:/var/cache/nginx # Persistent cache storage - ./infra/nginx/nginx-tls.conf:/etc/nginx/nginx.conf:ro # TLS-enabled config - ./certs:/app/certs:ro # Mount SSL certs # TCP kernel tuning for 3000 concurrent connections sysctls: - net.ipv4.tcp_fin_timeout=15 - net.ipv4.ip_local_port_range=1024 65535 ulimits: nofile: soft: 65535 hard: 65535 healthcheck: test: ["CMD", "curl", "-fk", "https://localhost/health"] interval: 30s timeout: 5s retries: 3 start_period: 10s deploy: resources: limits: cpus: '4' memory: 1G reservations: cpus: '2' memory: 512M profiles: ["tls"]
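
###############################################################################
# Post-start TLS checks (illustrative); ports and paths match the tls profile above:
#   openssl x509 -in certs/cert.pem -noout -text | grep -A1 "Subject Alternative Name"
#   openssl s_client -connect localhost:8443 -servername localhost </dev/null 2>/dev/null \
#     | openssl x509 -noout -subject -dates
###############################################################################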