#version: "3.9" # Supported by both podman-compose and Docker Compose v2+

###############################################################################
# HOST SYSTEM TUNING FOR LOAD TESTING (run before docker compose up)
# See docs/docs/testing/performance.md for full details
#
# One-liner (TCP + VM + I/O tuning):
#   sudo sysctl -w net.core.somaxconn=65535 net.core.netdev_max_backlog=65535 net.ipv4.tcp_max_syn_backlog=65535 net.ipv4.tcp_tw_reuse=1 net.ipv4.tcp_fin_timeout=15 net.ipv4.ip_local_port_range="1024 65535" vm.swappiness=10 fs.aio-max-nr=1048576
#
# Make persistent: sudo tee /etc/sysctl.d/99-mcp-loadtest.conf (see docs)
###############################################################################
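#
# Illustrative sketch of that persistent file (values mirror the one-liner above;
# the file shipped in the docs may differ):
#   sudo tee /etc/sysctl.d/99-mcp-loadtest.conf <<'EOF'
#   net.core.somaxconn = 65535
#   net.core.netdev_max_backlog = 65535
#   net.ipv4.tcp_max_syn_backlog = 65535
#   net.ipv4.tcp_tw_reuse = 1
#   net.ipv4.tcp_fin_timeout = 15
#   net.ipv4.ip_local_port_range = 1024 65535
#   vm.swappiness = 10
#   fs.aio-max-nr = 1048576
#   EOF
#   sudo sysctl --system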
###############################################################################
# DOCKER COMPOSE PROFILES
#
# Default (no profile):  Gateway + Postgres + Redis + Nginx (HTTP only)
# --profile monitoring:  Adds Prometheus, Grafana, Loki, exporters
# --profile benchmark:   Adds benchmark MCP servers for load testing
# --profile tls:         Enables HTTPS via nginx_tls (auto-generates certs)
#
# TLS Quick Start:
#   make compose-tls          # HTTP:8080 + HTTPS:8443
#   make compose-tls-https    # Force HTTPS (HTTP redirects)
#   curl -sk https://localhost:8443/health
#
# Custom certificates:
#   mkdir -p certs && cp your-cert.pem certs/cert.pem && cp your-key.pem certs/key.pem
#   make compose-tls
#
# Environment variables (TLS profile):
#   NGINX_FORCE_HTTPS=true    # Redirect all HTTP to HTTPS
###############################################################################

###############################################################################
# NETWORKS + VOLUMES - declared first so they can be referenced later
###############################################################################
networks:
  mcpnet: # Single user-defined bridge network keeps traffic private
    driver: bridge

volumes: # Named volumes survive podman-compose down/up
  pgdata:
  # pgdata18: # Enable for postgres 18+
  pgadmindata:
  redisinsight_data:
  nginx_cache:
  grafanadata:
  prometheusdata:
  lokidata:

###############################################################################
# CORE SERVICE - MCP Gateway
###############################################################################
services:
  # ──────────────────────────────────────────────────────────────────────
  # Nginx Caching Proxy - High-performance reverse proxy with CDN-like caching
  # ──────────────────────────────────────────────────────────────────────
  nginx:
    build:
      context: ./infra/nginx
      dockerfile: Dockerfile
    image: mcpgateway/nginx-cache:latest
    restart: unless-stopped
    ports:
      - "8080:80" # HTTP caching proxy (public-facing)
      # - "8443:443" # HTTPS caching proxy (public-facing)
    networks: [mcpnet]
    depends_on:
      gateway:
        condition: service_healthy
    volumes:
      - nginx_cache:/var/cache/nginx # Persistent cache storage
      - ./infra/nginx/nginx.conf:/etc/nginx/nginx.conf:ro # Mount config as read-only
      # - ./certs:/app/certs:ro # Mount SSL certs for HTTPS backend verification
    # TCP kernel tuning for 3000 concurrent connections
    # Note: most net.core.* sysctls are host-level and cannot be set per-container
    # (net.core.somaxconn is the namespaced exception, used by fast_test_server below);
    # only network-namespace-aware sysctls such as net.ipv4.* work here
    sysctls:
      - net.ipv4.tcp_fin_timeout=15 # Faster cleanup of FIN_WAIT2 sockets
      - net.ipv4.ip_local_port_range=1024 65535 # More ephemeral ports for upstream
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 1G
        reservations:
          cpus: '2'
          memory: 512M

  # ──────────────────────────────────────────────────────────────────────
  # MCP Gateway - the main API server for the MCP stack
  # ──────────────────────────────────────────────────────────────────────
  gateway:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest} # Use the local latest image. Run `make docker-prod` to build it.
    #image: ghcr.io/ibm/mcp-context-forge:1.0.0-BETA-1 # Use the release MCP Context Forge image
    #image: ghcr.io/ibm/mcp-context-forge:0.7.0 # Testing migration from 0.7.0
    build:
      context: .
      dockerfile: Containerfile.lite # Same one the Makefile builds
    restart: unless-stopped
    # NOTE: When using replicas > 1, access via nginx:8080 instead of direct port 4444
    # ports:
    #   - "4444:4444" # Disabled for multi-replica mode
    networks: [mcpnet]
    # ──────────────────────────────────────────────────────────────────────
    # Environment - pick ONE database URL line, comment the rest
    # ──────────────────────────────────────────────────────────────────────
    environment:
      # ═══════════════════════════════════════════════════════════════════════════
      # HTTP Server Selection: gunicorn vs granian
      # ═══════════════════════════════════════════════════════════════════════════
      # Performance comparison (2500 concurrent users, PostgreSQL backend):
      #   Gunicorn: ~2.7GB RAM, ~740% CPU, no backpressure (queues unbounded)
      #   Granian:  ~4.0GB RAM, ~680% CPU, native backpressure (rejects excess with 503)
      #
      # Choose Gunicorn for: memory-constrained environments (32% less RAM)
      # Choose Granian for:  load spike protection, bursty traffic (graceful degradation)
      # Both achieve the same RPS when the database is the bottleneck.
      # ═══════════════════════════════════════════════════════════════════════════
      # - HTTP_SERVER=granian # Rust-based, native backpressure, +47% memory, -8% CPU
      - HTTP_SERVER=gunicorn # Python-based, battle-tested, lower memory usage
      - HOST=0.0.0.0
      - PORT=4444
      # Domain for CORS/cookies (nginx default at http://localhost:8080)
      - APP_DOMAIN=${APP_DOMAIN:-http://localhost:8080}
      # Transport: sse, streamablehttp, http, or all (default: all)
      - TRANSPORT_TYPE=streamablehttp
      # Database connection: via PgBouncer (default) or direct PostgreSQL
      # PgBouncer provides connection pooling for better performance under high concurrency
      - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@pgbouncer:6432/mcp
      # Direct PostgreSQL connection (bypass PgBouncer - increase DB_POOL_SIZE if using):
      # - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
      # SQLAlchemy query logging (useful for N+1 detection; noisy under load)
      # NOTE: SQLALCHEMY_ECHO logs at INFO; set LOG_LEVEL=INFO/DEBUG to see output.
      - SQLALCHEMY_ECHO=false
      - CACHE_TYPE=redis # backend for caching (memory, redis, database, or none)
      - REDIS_URL=redis://redis:6379/0
      # Redis parser: hiredis (C extension ~83x faster for large responses)
      - REDIS_PARSER=hiredis
      # Redis connection pool tuning for load testing (3 replicas × 24 workers × 100 = 7200 < 10000 maxclients)
      - REDIS_MAX_CONNECTIONS=100
      - REDIS_SOCKET_TIMEOUT=5.0
      - REDIS_SOCKET_CONNECT_TIMEOUT=5.0
      - REDIS_HEALTH_CHECK_INTERVAL=30
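      # Illustrative check of the budget above against Redis's maxclients at runtime
      # (assumes the redis service defined later in this file is running):
      #   docker compose exec redis redis-cli info clients
      #   docker compose exec redis redis-cli config get maxclients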
      # ═══════════════════════════════════════════════════════════════════════════
      # Redis Startup Resilience (prevents crash-loop on Redis outage)
      # ═══════════════════════════════════════════════════════════════════════════
      # With exponential backoff: 2s, 4s, 8s, 16s, 30s (capped), 30s...
      # 30 retries = ~5 minutes total wait before worker gives up
      - REDIS_MAX_RETRIES=30 # Max attempts before worker exits (default: 30)
      - REDIS_RETRY_INTERVAL_MS=2000 # Base interval, grows exponentially with jitter
      - REDIS_MAX_BACKOFF_SECONDS=30 # Max backoff cap (jitter ±25% applied after)
      # Auth Cache Configuration (reduces DB queries per auth request from 3-4 to 0-1)
      - AUTH_CACHE_ENABLED=${AUTH_CACHE_ENABLED:-true}
      - AUTH_CACHE_USER_TTL=300
      - AUTH_CACHE_REVOCATION_TTL=120
      - AUTH_CACHE_TEAM_TTL=300
      - AUTH_CACHE_ROLE_TTL=300
      - AUTH_CACHE_BATCH_QUERIES=true
      - AUTH_CACHE_TEAMS_TTL=300
      # Registry Cache Configuration (reduces DB queries for list endpoints)
      - REGISTRY_CACHE_ENABLED=true
      - REGISTRY_CACHE_TOOLS_TTL=300
      - REGISTRY_CACHE_PROMPTS_TTL=300
      - REGISTRY_CACHE_RESOURCES_TTL=300
      - REGISTRY_CACHE_AGENTS_TTL=300
      - REGISTRY_CACHE_SERVERS_TTL=300
      - REGISTRY_CACHE_GATEWAYS_TTL=300
      - REGISTRY_CACHE_CATALOG_TTL=300
      # Admin Stats Cache Configuration (reduces aggregate queries for dashboard)
      - ADMIN_STATS_CACHE_ENABLED=true
      - ADMIN_STATS_CACHE_SYSTEM_TTL=60
      - ADMIN_STATS_CACHE_OBSERVABILITY_TTL=30
      - ADMIN_STATS_CACHE_TAGS_TTL=120
      - ADMIN_STATS_CACHE_PLUGINS_TTL=120
      - ADMIN_STATS_CACHE_PERFORMANCE_TTL=60
      # Team member count cache (reduces N+1 queries)
      - TEAM_MEMBER_COUNT_CACHE_ENABLED=true
      - TEAM_MEMBER_COUNT_CACHE_TTL=300
      # Metrics aggregation cache (reduces full table scans, see #1906)
      - METRICS_CACHE_ENABLED=true
      - METRICS_CACHE_TTL_SECONDS=120
      # MCP Server Health Check
      # Interval in seconds between health checks (default: 300)
      - HEALTH_CHECK_INTERVAL=300
      # Timeout in seconds for each health check request (default: 5)
      - HEALTH_CHECK_TIMEOUT=5
      # Consecutive failures before marking gateway offline (default: 3)
      - UNHEALTHY_THRESHOLD=3
      # Gateway URL validation timeout in seconds (default: 5)
      - GATEWAY_VALIDATION_TIMEOUT=5
      # Max concurrent health checks per worker (default: 10)
      - MAX_CONCURRENT_HEALTH_CHECKS=10
      # JWT Configuration - Choose ONE approach:
      # Option 1: HMAC (Default - Simple deployments)
      - JWT_ALGORITHM=HS256
      - JWT_SECRET_KEY=my-test-key
      # Option 2: RSA (Production - Asymmetric, uncomment and generate certs)
      # - JWT_ALGORITHM=RS256
      # - JWT_PUBLIC_KEY_PATH=/app/certs/jwt/public.pem
      # - JWT_PRIVATE_KEY_PATH=/app/certs/jwt/private.pem
      - JWT_AUDIENCE=mcpgateway-api
      - JWT_ISSUER=mcpgateway
      # Basic auth is DISABLED by default for security (API_ALLOW_BASIC_AUTH=false)
      # Only set these if you explicitly enable Basic auth for backwards compatibility
      # - API_ALLOW_BASIC_AUTH=true
      # - BASIC_AUTH_USER=${BASIC_AUTH_USER:-admin}
      # - BASIC_AUTH_PASSWORD=${BASIC_AUTH_PASSWORD:-changeme}
      # Auth encryption secret + default user password
      - AUTH_ENCRYPTION_SECRET=${AUTH_ENCRYPTION_SECRET:-my-test-salt}
      - DEFAULT_USER_PASSWORD=${DEFAULT_USER_PASSWORD:-changeme}
      # Admin UI uses email/password authentication
      - EMAIL_AUTH_ENABLED=true
      - PLATFORM_ADMIN_EMAIL=admin@example.com
      - PLATFORM_ADMIN_PASSWORD=changeme
      # Security defaults (tokens require expiration and JTI for revocation)
      - REQUIRE_TOKEN_EXPIRATION=${REQUIRE_TOKEN_EXPIRATION:-true}
      - REQUIRE_JTI=${REQUIRE_JTI:-true}
      - REQUIRE_USER_IN_DB=${REQUIRE_USER_IN_DB:-false}
      - MCPGATEWAY_UI_ENABLED=true
      - MCPGATEWAY_ADMIN_API_ENABLED=true
      # Security configuration (using defaults)
      - ENVIRONMENT=development
      - SECURITY_HEADERS_ENABLED=true
      - CORS_ALLOW_CREDENTIALS=true
      - SECURE_COOKIES=false
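      # Illustrative: with the HS256 settings above, an API bearer token can be minted
      # the same way the register_* helper services below do (the exec target and the
      # nginx port 8080 are taken from this file's defaults; adjust if you changed them):
      #   TOKEN=$(docker compose exec -T gateway python3 -m mcpgateway.utils.create_jwt_token \
      #     --username admin@example.com --exp 10080 --secret my-test-key --algo HS256)
      #   curl -s -H "Authorization: Bearer $TOKEN" http://localhost:8080/version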
      # ═══════════════════════════════════════════════════════════════════════════
      # SSRF Protection (Server-Side Request Forgery)
      # ═══════════════════════════════════════════════════════════════════════════
      # Prevents the gateway from accessing internal resources or cloud metadata services.
      # Default: enabled with safe settings for dev/internal deployments.
      # Cloud metadata (169.254.169.254, etc.) is ALWAYS blocked by default.
      # - SSRF_PROTECTION_ENABLED=true # Master switch (default: true)
      # - SSRF_ALLOW_LOCALHOST=true # Allow localhost (default: true for dev)
      # - SSRF_ALLOW_PRIVATE_NETWORKS=true # Allow 10.x, 172.16.x, 192.168.x (default: true)
      # - SSRF_DNS_FAIL_CLOSED=false # Reject on DNS failure (default: false = fail open)
      # For strict production mode (external endpoints only):
      # - SSRF_ALLOW_LOCALHOST=false
      # - SSRF_ALLOW_PRIVATE_NETWORKS=false
      # - SSRF_DNS_FAIL_CLOSED=true
      # Uncomment to enable stateful sessions for Streamable HTTP transport
      # - USE_STATEFUL_SESSIONS=true
      # Uncomment to enable session affinity between downstream (from client) and upstream (to MCP server) sessions
      # - MCPGATEWAY_SESSION_AFFINITY_ENABLED=true
      ## Uncomment to enable HTTPS (run `make certs` first)
      # - SSL=true
      # - CERT_FILE=/app/certs/cert.pem
      # - KEY_FILE=/app/certs/key.pem
      # For passphrase-protected keys: run `make certs-passphrase` and use:
      # - KEY_FILE=/app/certs/key-encrypted.pem
      # - KEY_FILE_PASSWORD=${KEY_FILE_PASSWORD}
      # Plugins (enabled here)
      - PLUGINS_ENABLED=true
      # MCP server catalog (enabled here)
      - MCPGATEWAY_CATALOG_ENABLED=true
      - MCPGATEWAY_CATALOG_FILE=/app/mcp-catalog.yml
      # Authentication configuration
      - AUTH_REQUIRED=true
      - MCP_CLIENT_AUTH_ENABLED=true
      - TRUST_PROXY_AUTH=false
      # Logging configuration
      # NOTE: LOG_LEVEL=INFO/DEBUG is required for SQLALCHEMY_ECHO output.
      - LOG_LEVEL=${LOG_LEVEL:-ERROR} # Default ERROR for load testing; raise to INFO/DEBUG to see SQLALCHEMY_ECHO output
      - DISABLE_ACCESS_LOG=true # Disable uvicorn access logs for performance (massive I/O overhead)
      # Template auto-reload disabled for performance (prevents re-parsing templates on each request)
      - TEMPLATES_AUTO_RELOAD=false
      - STRUCTURED_LOGGING_DATABASE_ENABLED=false # Disable DB logging for performance (use true only for debugging)
      # Audit trail logging - disabled by default for performance
      # WARNING: Causes a DB write on EVERY API request - can generate millions of rows during load testing!
      - AUDIT_TRAIL_ENABLED=false # Set to true for compliance requirements (SOC2, HIPAA, etc.)
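      # To confirm the logging settings above take effect (access logs off, LOG_LEVEL
      # honored), tailing recent gateway output is usually enough (illustrative):
      #   docker compose logs --tail=50 gateway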
      # Security event logging - disabled by default for performance
      # WARNING: "all" level logs every request and causes massive DB write load
      - SECURITY_LOGGING_ENABLED=false # Set to true to enable security event logging
      - SECURITY_LOGGING_LEVEL=failures_only # Options: all, failures_only, high_severity
      # Performance optimizations - disable CPU-intensive middlewares
      # NOTE: Compression is disabled here because nginx in front already compresses
      # responses. Keep it enabled if you run the gateway without nginx, otherwise
      # the larger payloads cause a throughput drop.
      - COMPRESSION_ENABLED=false
      # Disable optional middlewares for maximum throughput
      - VALIDATION_MIDDLEWARE_ENABLED=true
      - CORRELATION_ID_ENABLED=false
      - LLMCHAT_ENABLED=true
      - OBSERVABILITY_ENABLED=false
      # ═══════════════════════════════════════════════════════════════════════════
      # Database Connection Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Pool class options:
      #   - "null":  NullPool - no application pooling, PgBouncer handles all pooling (recommended)
      #   - "queue": QueuePool - application-side pooling (use with direct PostgreSQL)
      #   - "auto":  Automatic - NullPool if PgBouncer detected in URL, else QueuePool
      #
      # WITH PgBouncer (default in docker-compose):
      # Option A: NullPool - safest, eliminates stale connection errors, ~10% slower
      # - DB_POOL_CLASS=null
      # Option B: QueuePool + pre_ping - better performance, validates before use
      - DB_POOL_CLASS=queue
      - DB_POOL_PRE_PING=true # Validate connections before use (SELECT 1)
      - DB_POOL_SIZE=20 # Pool size per worker
      - DB_MAX_OVERFLOW=10 # Extra connections under load
      - DB_POOL_TIMEOUT=60 # Time to wait for connection before failing
      - DB_POOL_RECYCLE=60 # Recycle connections before PgBouncer's CLIENT_IDLE_TIMEOUT closes them
      # ═══════════════════════════════════════════════════════════════════════════
      # Database Startup Resilience (prevents crash-loop on DB outage)
      # ═══════════════════════════════════════════════════════════════════════════
      # With exponential backoff: 2s, 4s, 8s, 16s, 30s (capped), 30s...
      # 30 retries = ~5 minutes total wait before worker gives up
      - DB_MAX_RETRIES=30 # Max attempts before worker exits (default: 30)
      - DB_RETRY_INTERVAL_MS=2000 # Base interval, grows exponentially with jitter
      - DB_MAX_BACKOFF_SECONDS=30 # Max backoff cap (jitter ±25% applied after)
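      # Rough connection budget implied by the values above (illustrative arithmetic):
      #   3 replicas × 24 Gunicorn workers × (20 pool + 10 overflow) = 2160 possible
      #   client connections, which is why PgBouncer below allows MAX_CLIENT_CONN=5000
      #   while multiplexing them onto at most MAX_DB_CONNECTIONS=700 server connections.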
      # Tool configuration for high-concurrency load testing
      - TOOL_TIMEOUT=60 # Seconds before tool invocation times out
      - MAX_TOOL_RETRIES=3 # Retry attempts for failed tool invocations
      - TOOL_RATE_LIMIT=60000 # Max tool invocations per minute
      - TOOL_CONCURRENT_LIMIT=1000 # Max concurrent tool invocations
      - FEDERATION_TIMEOUT=30
      # ═══════════════════════════════════════════════════════════════════════════
      # HTTPX Client Connection Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Shared HTTP client for all outbound requests (federation, health checks,
      # A2A, SSO, catalog). Provides ~20x better performance than per-request clients.
      - HTTPX_MAX_CONNECTIONS=500 # Total connections in pool (increased from 200 for high concurrency)
      - HTTPX_MAX_KEEPALIVE_CONNECTIONS=300 # Keepalive connections (increased from 100)
      - HTTPX_KEEPALIVE_EXPIRY=30.0 # Idle connection expiry (seconds)
      - HTTPX_CONNECT_TIMEOUT=5.0 # TCP connection timeout (seconds)
      - HTTPX_READ_TIMEOUT=120.0 # Response read timeout (seconds, high for slow tools)
      - HTTPX_WRITE_TIMEOUT=30.0 # Request write timeout (seconds)
      - HTTPX_POOL_TIMEOUT=10.0 # Wait for available connection (seconds)
      - HTTPX_HTTP2_ENABLED=false # HTTP/2 support (requires server support)
      - HTTPX_ADMIN_READ_TIMEOUT=30.0 # Admin UI/health check timeout (seconds)
      # ═══════════════════════════════════════════════════════════════════════════
      # Gunicorn Configuration (used when HTTP_SERVER=gunicorn)
      # ═══════════════════════════════════════════════════════════════════════════
      - GUNICORN_WORKERS=24 # Worker processes (match CPU cores)
      - GUNICORN_TIMEOUT=120 # Worker timeout in seconds
      - GUNICORN_GRACEFUL_TIMEOUT=60 # Grace period for worker shutdown
      - GUNICORN_KEEP_ALIVE=30 # Keep-alive timeout (matches SSE keepalive)
      # Worker recycling cleans up MCP SDK stuck task groups (anyio#695 workaround)
      - GUNICORN_MAX_REQUESTS=1000000 # Recycle workers after 1M requests
      - GUNICORN_MAX_REQUESTS_JITTER=100000 # ±100000 jitter prevents thundering herd
      - GUNICORN_BACKLOG=4096 # Connection queue depth
      # ═══════════════════════════════════════════════════════════════════════════
      # Granian Backpressure Configuration (used when HTTP_SERVER=granian)
      # ═══════════════════════════════════════════════════════════════════════════
      # Backpressure provides overload protection by rejecting excess requests with
      # immediate 503 responses instead of queuing them (which can cause OOM/timeouts).
      # Total capacity = GRANIAN_WORKERS × GRANIAN_BACKPRESSURE = 16 × 128 = 2048 concurrent
      # Requests beyond this limit receive an immediate 503 (no queuing, no OOM)
      - GRANIAN_WORKERS=16
      - GRANIAN_BACKLOG=4096
      - GRANIAN_BACKPRESSURE=128
      - GRANIAN_HTTP1_BUFFER_SIZE=524288
      - GRANIAN_RESPAWN_FAILED=true
      # ───────────────────────────────────────────────────────────────────────
      # Granian Worker Lifecycle (recycling to prevent resource leaks)
      # ───────────────────────────────────────────────────────────────────────
      # Workaround for granian issue where SSE connections may not be properly
      # closed after client disconnect, causing CPU spin loops. See:
      #   - https://github.com/emmett-framework/granian/issues/286
      #   - https://github.com/IBM/mcp-context-forge/issues/2357
      #
      # GRANIAN_WORKERS_LIFETIME: Restart workers after this duration (min 60s)
      # GRANIAN_WORKERS_MAX_RSS:  Restart workers exceeding this memory (MiB)
      #
      # Using both provides natural jitter - workers hit memory limits at
      # different times based on load, with lifetime as a backstop.
      # - GRANIAN_WORKERS_LIFETIME=3600 # 1 hour max worker lifetime
      # - GRANIAN_WORKERS_MAX_RSS=512 # 512 MiB max RSS per worker
      # ───────────────────────────────────────────────────────────────────────
      # HTTP/2: Granian supports native HTTP/2 multiplexing, but it is not useful here because:
      #   - nginx sits in front and downgrades to HTTP/1.1 for upstream connections
      #   - nginx open-source doesn't support HTTP/2 to backends (only nginx Plus does)
      #   - the internal Docker network is fast enough that HTTP/2 gains are negligible
      # To use HTTP/2, either bypass nginx or use Granian with TLS directly.
      # - GRANIAN_HTTP=2
      # ═══════════════════════════════════════════════════════════════════════════
      # MCP Session Pool Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Session pooling for MCP ClientSessions reduces per-request overhead from
      # ~20ms to ~1-2ms (10-20x improvement). Sessions are isolated per user/tenant
      # via identity hashing to prevent cross-user session sharing.
      - MCP_SESSION_POOL_ENABLED=true # Enable session pooling (default: false, enabled for docker-compose)
      - MCP_SESSION_POOL_MAX_PER_KEY=200 # Max sessions per (URL, identity, transport) - increased from 150 for 4000+ users
      - MCP_SESSION_POOL_TTL=300.0 # Session TTL in seconds (default: 300)
      - MCP_SESSION_POOL_HEALTH_CHECK_INTERVAL=60.0 # Idle time before health check (default: 60)
      - MCP_SESSION_POOL_ACQUIRE_TIMEOUT=60.0 # Timeout waiting for session slot (default: 30)
      - MCP_SESSION_POOL_CREATE_TIMEOUT=30.0 # Timeout creating new session (default: 30)
      - MCP_SESSION_POOL_CIRCUIT_BREAKER_THRESHOLD=5 # Failures before circuit opens
      - MCP_SESSION_POOL_CIRCUIT_BREAKER_RESET=60.0 # Seconds before circuit resets
      - MCP_SESSION_POOL_IDLE_EVICTION=600.0 # Evict idle pool keys after (default: 600)
      - MCP_SESSION_POOL_TRANSPORT_TIMEOUT=30.0 # Timeout for all HTTP operations (default: 30)
      - MCP_SESSION_POOL_EXPLICIT_HEALTH_RPC=false # Force RPC on health checks (default: false)
      # Configurable health check chain - ordered list of methods to try (JSON array)
      # Options: ping, list_tools, list_prompts, list_resources, skip
      # - MCP_SESSION_POOL_HEALTH_CHECK_METHODS=["ping", "skip"] # Try ping, skip if unsupported
      - MCP_SESSION_POOL_HEALTH_CHECK_METHODS=["skip"] # skip, highest performance
      - MCP_SESSION_POOL_HEALTH_CHECK_TIMEOUT=5.0 # Timeout per health check attempt
      # ═══════════════════════════════════════════════════════════════════════════
      # CPU Spin Loop Mitigation (Issue #2360, anyio#695)
      # ═══════════════════════════════════════════════════════════════════════════
      # These settings mitigate CPU spin loops that can occur when SSE/MCP connections
      # are cancelled and internal tasks don't respond to CancelledError. The spin
      # happens in anyio's _deliver_cancellation method.
      #
      # See documentation: docs/docs/operations/cpu-spin-loop-mitigation.md
      # GitHub Issue:   https://github.com/IBM/mcp-context-forge/issues/2360
      # Upstream Issue: https://github.com/agronholm/anyio/issues/695
      #
      # ─────────────────────────────────────────────────────────────────────────
      # Layer 1: SSE Connection Protection
      # ─────────────────────────────────────────────────────────────────────────
      # Detect and close dead SSE connections before they cause spin loops.
      - SSE_SEND_TIMEOUT=30.0 # ASGI send() timeout (default: 30.0)
      - SSE_RAPID_YIELD_WINDOW_MS=1000 # Detection window in ms (default: 1000)
      - SSE_RAPID_YIELD_MAX=50 # Max yields before disconnect (default: 50, 0=disabled)
      # ─────────────────────────────────────────────────────────────────────────
      # Layer 2: Cleanup Timeouts
      # ─────────────────────────────────────────────────────────────────────────
      # Limit how long cleanup waits for stuck tasks. Short timeouts (0.5s) reduce
      # CPU waste during cancelled connection cleanup. Only affects cleanup, not
      # normal operation.
      - MCP_SESSION_POOL_CLEANUP_TIMEOUT=0.5 # Session __aexit__ timeout (default: 5.0)
      - SSE_TASK_GROUP_CLEANUP_TIMEOUT=0.5 # SSE task group timeout (default: 5.0)
      # ─────────────────────────────────────────────────────────────────────────
      # Layer 3: EXPERIMENTAL - anyio Monkey-Patch
      # ─────────────────────────────────────────────────────────────────────────
      # Last resort: patches anyio to limit _deliver_cancellation iterations.
      # Enable only if Layers 1-2 don't fully resolve the issue.
      # WARNING: May be removed when anyio/MCP SDK fix the upstream issue.
      - ANYIO_CANCEL_DELIVERY_PATCH_ENABLED=true # Enable workaround - TESTING
      - ANYIO_CANCEL_DELIVERY_MAX_ITERATIONS=500 # Max iterations before giving up (~60ms recovery)
      # ═══════════════════════════════════════════════════════════════════════════
      # Execution Metrics Recording
      # ═══════════════════════════════════════════════════════════════════════════
      # Controls tool/resource/prompt/server/A2A execution metrics (one DB row per operation).
      # Disable when using external observability to improve performance.
      # Set to true if you need per-operation metrics in the database.
      # Note: Does NOT affect log aggregation (METRICS_AGGREGATION_ENABLED) or Prometheus.
      - DB_METRICS_RECORDING_ENABLED=true
      # ═══════════════════════════════════════════════════════════════════════════
      # Metrics Configuration
      # ═══════════════════════════════════════════════════════════════════════════
      # Raw metrics are deleted after hourly rollups exist (default: 1 hour retention).
      # Rollups preserve all analytics (counts, p50/p95/p99) for 365 days.
      #
      # If using external observability (ELK, Datadog, Splunk), raw metrics are
      # redundant - your external platform handles debugging and audit trails.
      #
      # Configurable settings (uncomment to override defaults):
      # - METRICS_DELETE_RAW_AFTER_ROLLUP=true # Delete raw after rollup (default)
      # - METRICS_DELETE_RAW_AFTER_ROLLUP_HOURS=1 # Raw retention when rollup exists
      # - METRICS_CLEANUP_INTERVAL_HOURS=1 # Cleanup frequency (default: hourly)
      # - METRICS_RETENTION_DAYS=7 # Fallback retention (rollup disabled)
      #
      # For debugging without external observability, increase raw retention:
      # - METRICS_DELETE_RAW_AFTER_ROLLUP_HOURS=168 # Keep raw data 7 days
      # Phoenix Observability Integration (uncomment when using Phoenix)
      # - PHOENIX_ENDPOINT=${PHOENIX_ENDPOINT:-http://phoenix:6006}
      # - OTEL_EXPORTER_OTLP_ENDPOINT=${OTEL_EXPORTER_OTLP_ENDPOINT:-http://phoenix:4317}
      # - OTEL_SERVICE_NAME=${OTEL_SERVICE_NAME:-mcp-gateway}
      # - OTEL_TRACES_EXPORTER=${OTEL_TRACES_EXPORTER:-otlp}
      # - OTEL_METRICS_EXPORTER=${OTEL_METRICS_EXPORTER:-otlp}
      # - OTEL_RESOURCE_ATTRIBUTES=${OTEL_RESOURCE_ATTRIBUTES:-deployment.environment=docker,service.namespace=mcp}
    # TCP kernel tuning for high-concurrency MCP tool invocations
    # Each tool call creates a new connection → many TIME_WAIT sockets
    sysctls:
      - net.ipv4.tcp_fin_timeout=15 # Faster cleanup of FIN_WAIT2 sockets (default: 60)
      - net.ipv4.ip_local_port_range=1024 65535 # More ephemeral ports (default: 32768-60999)
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
    depends_on: # Default stack: PgBouncer + Redis (PgBouncer depends on Postgres)
      pgbouncer:
        condition: service_healthy # ▶ wait for connection pooler
      redis:
        condition: service_started
      # Direct PostgreSQL (uncomment if bypassing PgBouncer):
      # postgres:
      #   condition: service_healthy
      # migration:
      #   condition: service_completed_successfully
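    # The healthcheck below probes /health with Python's urllib because curl may not
    # be present in the lite image. The same probe can be run by hand (illustrative):
    #   docker compose exec gateway python3 -c \
    #     "import urllib.request; print(urllib.request.urlopen('http://localhost:4444/health', timeout=5).read().decode())"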
    healthcheck:
      ## HTTP healthcheck (default)
      test: ["CMD", "python3", "-c", "import urllib.request; import json; resp = urllib.request.urlopen('http://localhost:4444/health', timeout=5); data = json.loads(resp.read()); exit(0 if data.get('status') == 'healthy' else 1)"]
      ## Uncomment for HTTPS healthcheck (requires valid SSL cert)
      # test: ["CMD", "curl", "-f", "https://localhost:4444/health"]
      ## HTTPS healthcheck with SSL validation skipped (self-signed certs)
      # test: ["CMD", "curl", "-fk", "https://localhost:4444/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    # Scaling options:
    #   - Single instance: use port 4444 directly, replicas: 1
    #   - Multi-instance:  comment out ports, set replicas: 2+, access via nginx:8080
    # ──────────────────────────────────────────────────────────────────────
    # Server Engine Selection (Default: Granian - Rust-based HTTP server)
    # ──────────────────────────────────────────────────────────────────────
    # The image default is Granian; this file selects Gunicorn via HTTP_SERVER above.
    # To run Gunicorn with Uvicorn workers explicitly:
    # command: ["./run-gunicorn.sh"]
    deploy:
      mode: replicated
      replicas: 3
      resources:
        limits:
          cpus: '8'
          memory: 8G
        reservations:
          cpus: '4'
          memory: 4G
    # ──────────────────────────────────────────────────────────────────────
    # Volume Mounts
    # ──────────────────────────────────────────────────────────────────────
    # Uncomment to mount catalog configuration and SSL certificates
    # volumes:
    #   - ./mcp-catalog.yml:/app/mcp-catalog.yml:ro # mount catalog configuration
    #   - ./certs:/app/certs:ro # mount certs folder read-only (includes both SSL and JWT keys)
    #
    # SSL/TLS Certificate Setup:
    #   1. Generate certificates:
    #      - Without passphrase: make certs
    #      - With passphrase:    make certs-passphrase
    #   2. Uncomment the volumes mount above
    #   3. Set SSL environment variables
    #   4. If using a passphrase-protected key, set KEY_FILE_PASSWORD in the .env file
    #
    # For JWT asymmetric keys:
    #   1. Generate keys: make certs-jwt
    #   2. Uncomment the volumes mount above
    #   3. Switch JWT_ALGORITHM to RS256 and uncomment the JWT_*_KEY_PATH variables
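    # Illustrative (Compose v2): deploy.replicas above starts 3 gateway replicas;
    # with the ports mapping commented out they can also be scaled ad hoc:
    #   docker compose up -d --scale gateway=5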
###############################################################################
# DATABASES - enable ONE of these blocks and adjust DATABASE_URL
###############################################################################
  postgres:
    image: postgres:18
    shm_size: 256m # Increase from 64MB default to prevent shared memory exhaustion under load
    ulimits:
      nofile:
        soft: 8192
        hard: 8192
    ports:
      - "5433:5432" # Expose for baseline load testing (5433 to avoid conflict with local postgres)
    # Performance tuning for high-load testing (3000 sustained users)
    # WITH PgBouncer (default): 800 connections provides headroom for 700 pool + system overhead
    # DIRECT connection mode: increase to 4000 for (3 replicas × 16 workers × 80 pool)
    command:
      - "postgres"
      - "-c"
      - "max_connections=800" # Must exceed PgBouncer MAX_DB_CONNECTIONS (700) + overhead
      - "-c"
      - "shared_buffers=512MB"
      - "-c"
      - "work_mem=16MB"
      - "-c"
      - "effective_cache_size=1536MB"
      - "-c"
      - "maintenance_work_mem=128MB"
      - "-c"
      - "checkpoint_completion_target=0.9"
      - "-c"
      - "wal_buffers=16MB"
      - "-c"
      - "random_page_cost=1.1"
      - "-c"
      - "effective_io_concurrency=200"
      - "-c"
      - "max_worker_processes=8" # Total background workers (must be >= max_parallel_workers)
      - "-c"
      - "max_parallel_workers_per_gather=4" # Max workers per query's parallel operation
      - "-c"
      - "max_parallel_workers=8" # Total parallel workers available system-wide
      # === HIGH-CONCURRENCY TUNING (3000 users) ===
      # CRITICAL: idle_in_transaction_session_timeout prevents connection starvation
      # Application code now properly closes transactions via the get_db() commit-on-success pattern
      # This timeout is a safety net for any edge cases
      - "-c"
      - "idle_in_transaction_session_timeout=300s" # Kill stuck transactions after 300s (backstop behind PgBouncer's IDLE_TRANSACTION_TIMEOUT)
      - "-c"
      - "statement_timeout=120s" # Kill runaway queries after 120s
      - "-c"
      - "synchronous_commit=off" # Async WAL writes (2-10x faster commits)
      - "-c"
      - "commit_delay=100" # Batch commits within 100μs window
      # ═══════════════════════════════════════════════════════════════════════════
      # AUTOVACUUM TUNING - High-insert workloads (metrics tables)
      # ═══════════════════════════════════════════════════════════════════════════
      # High insert rates cause dead tuple accumulation. These settings help
      # PostgreSQL keep up with table bloat from metrics writes.
      # Uncomment if experiencing performance degradation under sustained load:
      # - "-c"
      # - "autovacuum_naptime=30s" # Check more frequently (default: 60s)
      # - "-c"
      # - "autovacuum_vacuum_scale_factor=0.05" # Vacuum at 5% dead tuples (default: 0.2)
      # - "-c"
      # - "autovacuum_vacuum_cost_limit=1000" # More vacuum work per cycle (default: 200)
      # === PG_STAT_STATEMENTS + AUTO_EXPLAIN ===
      # Query performance tracking and slow query plan logging
      # NOTE: Both extensions must be in the SAME shared_preload_libraries line!
      # After enabling, run in psql:
      #   CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
      #   SELECT * FROM pg_stat_statements ORDER BY total_exec_time DESC LIMIT 10;
      # - "-c"
      # - "shared_preload_libraries=pg_stat_statements"
      # - "shared_preload_libraries=pg_stat_statements,auto_explain" # Use this line to enable both
      # - "-c"
      # - "pg_stat_statements.track=all"
      # - "-c"
      # - "pg_stat_statements.max=10000"
      # AUTO_EXPLAIN settings (uncomment if using the combined shared_preload_libraries above)
      # - "-c"
      # - "auto_explain.log_min_duration=1000"
      # - "-c"
      # - "auto_explain.log_analyze=on"
      # === ROLLBACK DEBUGGING (disabled for performance) ===
      # - "-c"
      # - "log_min_error_statement=error"
      # - "-c"
      # - "log_min_messages=warning"
      # - "-c"
      # - "log_error_verbosity=verbose"
      # - "-c"
      # - "log_line_prefix=%t [%p]: user=%u,db=%d,app=%a,client=%h "
      # - "-c"
      # - "log_lock_waits=on"
      # - "-c"
      # - "deadlock_timeout=1s"
      # - "-c"
      # - "log_temp_files=0"
      # - "-c"
      # - "log_checkpoints=on"
      # - "-c"
      # - "log_connections=on"
      # - "-c"
      # - "log_disconnections=on"
      # - "-c"
      # - "idle_in_transaction_session_timeout=60s"
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=mysecretpassword
      - POSTGRES_DB=mcp
    volumes:
      # - pgdata:/var/lib/postgresql/data
      - pgdata:/var/lib/postgresql # Enable for postgres 18+
    networks: [mcpnet]
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER"]
      interval: 30s
      timeout: 5s
      retries: 5
      start_period: 20s
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          cpus: '2'
          memory: 2G

  # ──────────────────────────────────────────────────────────────────────
  # PgBouncer - Connection Pooler for PostgreSQL
  # Reduces connection overhead, improves throughput under high concurrency.
  # Enable by switching the gateway DATABASE_URL to pgbouncer:6432 instead of postgres:5432
  # ──────────────────────────────────────────────────────────────────────
  pgbouncer:
    image: edoburu/pgbouncer:latest
    restart: unless-stopped
    networks: [mcpnet]
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
    ports:
      - "6432:6432" # PgBouncer port (optional external access)
    environment:
      # Connection to upstream PostgreSQL
      - DATABASE_URL=postgres://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
      # PgBouncer listen port (default would be 5432, using 6432 to distinguish from PostgreSQL)
      - LISTEN_PORT=6432
      # Pool mode: transaction (recommended), session, or statement
      # transaction: connection returned after each transaction (best for web apps)
      - POOL_MODE=transaction
      # ═══════════════════════════════════════════════════════════════════════════
      # Connection Pool Tuning for 3000 Sustained Users
      # PgBouncer handles connection multiplexing - many app connections share fewer DB connections
      # ═══════════════════════════════════════════════════════════════════════════
      # Client-side limits (from gateway workers via SQLAlchemy)
      - MAX_CLIENT_CONN=5000 # Max app connections; must exceed (replicas × workers × pool)
      - DEFAULT_POOL_SIZE=600 # Shared DB connections; sized for ~70 concurrent tx × 8x headroom
      - MIN_POOL_SIZE=100 # Pre-warmed connections for instant response to load spikes
      - RESERVE_POOL_SIZE=150 # Emergency pool for burst traffic beyond DEFAULT_POOL_SIZE
      - RESERVE_POOL_TIMEOUT=2 # Seconds before tapping reserve pool
      # Server-side limits (to PostgreSQL)
      - MAX_DB_CONNECTIONS=700 # Max connections to PostgreSQL; must be < PG max_connections
      - MAX_USER_CONNECTIONS=700 # Per-user limit; typically equals MAX_DB_CONNECTIONS
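      # Illustrative: live pool usage can be inspected through PgBouncer's admin
      # database (port 6432 is published above; the connection string mirrors the
      # pgbouncer_exporter below; assumes psql is installed on the host):
      #   psql "postgres://postgres:mysecretpassword@localhost:6432/pgbouncer" -c "SHOW POOLS;"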
      # Connection lifecycle
      - SERVER_LIFETIME=3600 # Recycle server connections after 1 hour (prevents stale state)
      - SERVER_IDLE_TIMEOUT=600 # Close unused server connections after 10 min
      # Timeout settings
      - QUERY_WAIT_TIMEOUT=60 # Max wait for an available connection before failing the request
      - CLIENT_IDLE_TIMEOUT=60 # Close client connections idle for more than 60s
      - SERVER_CONNECT_TIMEOUT=5 # Timeout for new connections to PostgreSQL
      # Transaction cleanup - critical for avoiding idle-in-transaction buildup
      # NOTE: In transaction pooling, session-level advisory locks (used by migrations)
      # can stick unless the reset query clears them; DISCARD ALL is safest.
      - SERVER_RESET_QUERY=DISCARD ALL # Reset connection state when returned to pool
      - SERVER_RESET_QUERY_ALWAYS=1 # Always run reset query even after clean transactions
      - IDLE_TRANSACTION_TIMEOUT=30 # Kill transactions idle > 30s to prevent connection pool exhaustion
      # Authentication
      - AUTH_TYPE=scram-sha-256 # Match PostgreSQL auth method
    depends_on:
      postgres:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "pg_isready", "-h", "localhost", "-p", "6432"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 256M
        reservations:
          cpus: '0.5'
          memory: 128M

  # migration:
  #   #image: ghcr.io/ibm/mcp-context-forge:0.7.0 # Testing migration from 0.7.0
  #   image: mcpgateway/mcpgateway:latest # Use the local latest image. Run `make docker-prod` to build it.
  #   build:
  #     context: .
  #     dockerfile: Containerfile
  #   environment:
  #     - DATABASE_URL=postgresql+psycopg://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp
  #   command: alembic -c mcpgateway/alembic.ini upgrade head
  #   depends_on:
  #     postgres:
  #       condition: service_healthy
  #   networks: [mcpnet]

###############################################################################
# CACHE
###############################################################################
  redis:
    image: redis:latest
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
    # Performance tuning for 1000+ RPS high-concurrency load testing
    command:
      - "redis-server"
      - "--maxmemory"
      - "1gb"
      - "--maxmemory-policy"
      - "allkeys-lru"
      - "--tcp-backlog"
      - "2048"
      - "--timeout"
      - "0"
      - "--tcp-keepalive"
      - "300"
      - "--maxclients"
      - "10000"
    ports:
      - "6379:6379" # expose only if you want host access
    networks: [mcpnet]
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '1'
          memory: 1G

###############################################################################
# MONITORING STACK (enabled with --profile monitoring)
# Usage:  docker compose --profile monitoring up -d
# Access: Grafana    http://localhost:3000 (admin/changeme)
#         Prometheus http://localhost:9090
###############################################################################

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus PostgreSQL Exporter - Database metrics
  # Metrics: connections, query duration, locks, cache hit ratio
  # ──────────────────────────────────────────────────────────────────────
  postgres_exporter:
    image: quay.io/prometheuscommunity/postgres-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9187:9187" # http://localhost:9187/metrics
    environment:
      - DATA_SOURCE_NAME=postgresql://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@postgres:5432/mcp?sslmode=disable
      - PG_EXPORTER_AUTO_DISCOVER_DATABASES=true
    depends_on:
      postgres:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus Redis Exporter - Cache metrics
  # Metrics: memory, clients, commands/sec, keyspace stats
  # ──────────────────────────────────────────────────────────────────────
  redis_exporter:
    image: oliver006/redis_exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9121:9121" # http://localhost:9121/metrics
    environment:
      - REDIS_ADDR=redis://redis:6379
    depends_on:
      redis:
        condition: service_started
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus PgBouncer Exporter - Connection pool metrics
  # Metrics: active/waiting clients, server connections, pool stats
  # ──────────────────────────────────────────────────────────────────────
  pgbouncer_exporter:
    image: prometheuscommunity/pgbouncer-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9127:9127" # http://localhost:9127/metrics
    environment:
      - PGBOUNCER_EXPORTER_CONNECTION_STRING=postgres://postgres:${POSTGRES_PASSWORD:-mysecretpassword}@pgbouncer:6432/pgbouncer?sslmode=disable
    depends_on:
      pgbouncer:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus Nginx Exporter - Proxy metrics
  # Metrics: active connections, requests/sec, response codes
  # Requires stub_status enabled in nginx.conf (location /nginx_status)
  # ──────────────────────────────────────────────────────────────────────
  nginx_exporter:
    image: nginx/nginx-prometheus-exporter:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9113:9113" # http://localhost:9113/metrics
    command:
      - '-nginx.scrape-uri=http://nginx:80/nginx_status'
    depends_on:
      nginx:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # cAdvisor - Container metrics (CPU, memory, network, disk I/O)
  # Metrics: container_cpu_usage_seconds_total, container_memory_usage_bytes
  # Dashboard: Grafana ID 14282 (Docker and cAdvisor)
  # ──────────────────────────────────────────────────────────────────────
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8085:8080" # http://localhost:8085/metrics
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Prometheus - Metrics collection and storage
  # Scrapes: gateway, postgres, redis, nginx, cadvisor
  # Retention: 7 days (configurable via --storage.tsdb.retention.time)
  # ──────────────────────────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "9090:9090" # http://localhost:9090
    volumes:
      - ./infra/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheusdata:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=7d'
      - '--web.enable-lifecycle'
    depends_on:
      - postgres_exporter
      - redis_exporter
      - nginx_exporter
      - cadvisor
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Loki - Log aggregation system (like Prometheus, but for logs)
  # Query logs with LogQL in Grafana
  # ──────────────────────────────────────────────────────────────────────
  loki:
    image: grafana/loki:latest
    restart: unless-stopped
    networks: [mcpnet]
    user: "0" # Run as root to avoid permission issues
    ports:
      - "3100:3100" # http://localhost:3100/ready
    volumes:
      - ./infra/monitoring/loki/loki-config.yaml:/etc/loki/local-config.yaml:ro
      - lokidata:/loki
    command: -config.file=/etc/loki/local-config.yaml
    profiles: ["monitoring"]
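  # Illustrative liveness checks once the monitoring profile is up (ports come from
  # the mappings in this file; Prometheus's /-/healthy and Loki's /ready are the
  # standard readiness endpoints):
  #   curl -s http://localhost:9090/-/healthy   # Prometheus
  #   curl -s http://localhost:3100/ready       # Loki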
  # ──────────────────────────────────────────────────────────────────────
  # Promtail - Log collector for Loki
  # Collects logs from all containers via the Docker socket
  # ──────────────────────────────────────────────────────────────────────
  promtail:
    image: grafana/promtail:latest
    restart: unless-stopped
    networks: [mcpnet]
    volumes:
      - ./infra/monitoring/loki/promtail-config.yaml:/etc/promtail/config.yaml:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    command: -config.file=/etc/promtail/config.yaml
    depends_on:
      - loki
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Grafana - Dashboard visualization
  # Default login: admin / changeme
  # Recommended dashboards:
  #   - Docker/cAdvisor: 14282
  #   - PostgreSQL:      9628
  #   - Redis:           763
  #   - Nginx:           12708
  # ──────────────────────────────────────────────────────────────────────
  grafana:
    image: grafana/grafana:latest
    restart: unless-stopped
    networks: [mcpnet]
    user: "0" # Run as root to avoid permission issues with provisioning
    ports:
      - "3000:3000" # http://localhost:3000
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=changeme
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafanadata:/var/lib/grafana
      - ./infra/monitoring/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
      - ./infra/monitoring/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
    depends_on:
      - prometheus
    profiles: ["monitoring"]

###############################################################################
# OPTIONAL ADMIN TOOLS - handy web UIs for DB & cache (disabled by default)
###############################################################################
  pgadmin: # 🔧 Postgres admin UI
    image: dpage/pgadmin4:9.11.0
    environment:
      - PGADMIN_DEFAULT_EMAIL=admin@example.com
      - PGADMIN_DEFAULT_PASSWORD=changeme
    ports:
      - "5050:80" # http://localhost:5050
    volumes:
      - pgadmindata:/var/lib/pgadmin
    networks: [mcpnet]
    depends_on:
      postgres:
        condition: service_healthy
    profiles: ["monitoring"]

  # ──────────────────────────────────────────────────────────────────────
  # Redis Commander - a web-based Redis GUI
  # ──────────────────────────────────────────────────────────────────────
  redis_commander: # 🔧 Redis key browser
    image: rediscommander/redis-commander:latest
    restart: unless-stopped
    networks: [mcpnet]
    depends_on:
      redis:
        condition: service_started
    ports:
      - "8081:8081" # http://localhost:8081
    environment:
      - REDIS_HOSTS=local:redis:6379
      - HTTP_USER=admin
      - HTTP_PASSWORD=changeme
    profiles: ["monitoring"]

  # # ──────────────────────────────────────────────────────────────────────
  # # Redis Insight - a powerful Redis GUI (recently updated)
  # # ──────────────────────────────────────────────────────────────────────
  # redis_insight: # 🔧 Redis Insight GUI
  #   image: redis/redisinsight:latest
  #   container_name: redisinsight
  #   restart: unless-stopped
  #   networks: [mcpnet]
  #   ports:
  #     - "5540:5540" # Redis Insight UI (default 5540)
  #   depends_on: # Default stack: Postgres + Redis
  #     redis:
  #       condition: service_started
  #   # ──────────────────────────────────────────────────────────────────────
  #   # Persist data (config, logs, history) between restarts
  #   # ──────────────────────────────────────────────────────────────────────
  #   # volumes:
  #   #   - ./redisinsight_data:/data
  #   volumes:
  #     - redisinsight_data:/data # <- persist data in named volume
  #   # ──────────────────────────────────────────────────────────────────────
  #   # Preconfigure Redis connection(s) via env vars
  #   # ──────────────────────────────────────────────────────────────────────
  #   environment:
  #     # Single connection (omit "*" since only one):
  #     - RI_REDIS_HOST=redis # <- your Redis hostname
  #     - RI_REDIS_PORT=6379 # <- your Redis port
  #     - RI_REDIS_USERNAME=default # <- ACL/username (Redis 6+)
  #     #- RI_REDIS_PASSWORD=changeme # <- Redis AUTH password
  #     #- RI_REDIS_TLS=true # <- enable TLS
  #     # Optional: validate self-signed CA instead of trusting all:
  #     # - RI_REDIS_TLS_CA_PATH=/certs/selfsigned.crt
  #     # - RI_REDIS_TLS_CERT_PATH=/certs/client.crt
  #     # - RI_REDIS_TLS_KEY_PATH=/certs/client.key
  #     # - RI_REDIS_TLS=true # (already set above)
  #     # ──────────────────────────────────────────────────────────────
  #     # Core Redis Insight settings
  #     # ──────────────────────────────────────────────────────────────
  #     - RI_APP_HOST=0.0.0.0 # <- listen on all interfaces
  #     - RI_APP_PORT=5540 # <- UI port (container-side)

###############################################################################
# OPTIONAL MCP SERVERS - drop-in helpers the Gateway can call
###############################################################################

###############################################################################
# Fast Time Server - High-performance time/timezone service for MCP
# Uses the pre-built image by default. On ARM64, build locally:
#   FAST_TIME_IMAGE=mcpgateway/fast-time-server:local docker compose build fast_time_server
###############################################################################
  fast_time_server:
    image: ${FAST_TIME_IMAGE:-ghcr.io/ibm/fast-time-server:latest}
    build:
      context: ./mcp-servers/go/fast-time-server
      dockerfile: Dockerfile
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8888:8080" # Map host port 8888 to container port 8080
    # Use dual mode for both SSE (/sse) and Streamable HTTP (/http) endpoints
    command: ["-transport=dual", "-listen=0.0.0.0", "-port=8080", "-log-level=info"]

###############################################################################
# Auto-registration service - registers fast_time_server with gateway
###############################################################################
  register_fast_time:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest}
    networks: [mcpnet]
    depends_on:
      gateway:
        condition: service_healthy
      fast_time_server:
        condition: service_started
    environment:
      - JWT_SECRET_KEY=my-test-key
    # This is a one-shot container that exits after registration
    restart: "no"
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "Using latest gateway image with current JWT utility..."
        echo "Waiting for services to be ready..."

        # Wait for gateway to be ready using Python
        python3 -c "
        import time
        import urllib.request
        import urllib.error

        for i in range(1, 61):
            try:
                with urllib.request.urlopen('http://gateway:4444/health', timeout=2) as response:
                    if response.status == 200:
                        print('✅ gateway is healthy')
                        break
            except:
                pass
            print(f'Waiting for gateway... ({i}/60)')
            time.sleep(2)
        else:
            print('❌ Gateway failed to become healthy')
            exit(1)
        "

        # Wait for fast_time_server to be ready using Python
        python3 -c "
        import time
        import urllib.request
        import urllib.error

        for i in range(1, 31):
            try:
                with urllib.request.urlopen('http://fast_time_server:8080/health', timeout=2) as response:
                    if response.status == 200:
                        print('✅ fast_time_server is healthy')
                        break
            except:
                pass
            print(f'Waiting for fast_time_server... ({i}/30)')
            time.sleep(2)
        else:
            print('❌ Fast time server failed to become healthy')
            exit(1)
        "

        echo "Generating JWT token..."
        echo "Environment: JWT_SECRET_KEY=$$JWT_SECRET_KEY"
        echo "Running: python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256"

        # Only capture stdout (the token), let warnings go to stderr
        export MCPGATEWAY_BEARER_TOKEN=$$(python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256 2>/dev/null)
        echo "Generated token: $$MCPGATEWAY_BEARER_TOKEN"

        # Decode the token to verify it has expiration
        echo "Decoding token to verify claims..."
        python3 -m mcpgateway.utils.create_jwt_token --decode "$$MCPGATEWAY_BEARER_TOKEN" 2>/dev/null || echo "Failed to decode token"

        # Test authentication first
        echo "Testing authentication..."

        # Use Python to make HTTP requests
        python3 -c "
        import urllib.request
        import urllib.error
        import json
        import sys
        import os
        import time

        token = os.environ.get('MCPGATEWAY_BEARER_TOKEN', '')

        def api_request(method, path, data=None):
            '''Helper to make authenticated API requests.'''
            url = f'http://gateway:4444{path}'
            req = urllib.request.Request(url, method=method)
            req.add_header('Authorization', f'Bearer {token}')
            req.add_header('Content-Type', 'application/json')
            if data:
                req.data = json.dumps(data).encode('utf-8')
            with urllib.request.urlopen(req) as response:
                return json.loads(response.read().decode('utf-8'))

        # Test version endpoint without auth
        print('Checking gateway config...')
        try:
            with urllib.request.urlopen('http://gateway:4444/version') as response:
                data = response.read().decode('utf-8')
                print(f'Gateway version response (no auth): {data[:200]}')
        except Exception as e:
            print(f'Version check failed: {e}')

        # Test version endpoint with auth
        print('Testing authentication...')
        try:
            req = urllib.request.Request('http://gateway:4444/version')
            req.add_header('Authorization', f'Bearer {token}')
            with urllib.request.urlopen(req) as response:
                data = response.read().decode('utf-8')
                print(f'Auth test response: SUCCESS')
                auth_success = True
        except Exception as e:
            print(f'Auth test response: FAILED - {e}')
            auth_success = False

        # Register fast_time_server with gateway using Streamable HTTP transport
        print('Registering fast_time_server with gateway (Streamable HTTP)...')

        # First check if gateway already exists and delete it
        gateway_id = None
        try:
            gateways = api_request('GET', '/gateways')
            for gw in gateways:
                if gw.get('name') == 'fast_time':
                    print(f'Found existing gateway {gw[\"id\"]}, deleting...')
                    api_request('DELETE', f'/gateways/{gw[\"id\"]}')
                    print('Deleted existing gateway')
        except Exception as e:
            print(f'Note: Could not check/delete existing gateway: {e}')

        # Delete existing virtual server if present (using fixed ID)
        VIRTUAL_SERVER_ID = '9779b6698cbd4b4995ee04a4fab38737'
        try:
            api_request('DELETE', f'/servers/{VIRTUAL_SERVER_ID}')
            print(f'Deleted existing virtual server {VIRTUAL_SERVER_ID}')
        except Exception as e:
            print(f'Note: No existing virtual server to delete (or error: {e})')

        # Register the gateway
        try:
            result = api_request('POST', '/gateways', {
                'name': 'fast_time',
                'url': 'http://fast_time_server:8080/http',
                'transport': 'STREAMABLEHTTP'
            })
            print(f'Registration response: {result}')
            if 'id' in result:
                gateway_id = result['id']
                print(f'✅ Successfully registered fast_time_server (gateway_id: {gateway_id})')
            else:
                print('❌ Registration failed - no ID in response')
                sys.exit(1)
        except Exception as e:
            print(f'❌ Registration failed: {e}')
            sys.exit(1)

        # Wait for tools to be synced from the gateway
        print('Waiting for tools/resources/prompts to sync...')
        for i in range(30):
            time.sleep(1)
            try:
                tools = api_request('GET', '/tools')
                # Filter tools from fast_time gateway (note: camelCase gatewayId)
                fast_time_tools = [t for t in tools if t.get('gatewayId') == gateway_id]
                if fast_time_tools:
                    print(f'Found {len(fast_time_tools)} tools from fast_time gateway')
                    break
            except Exception as e:
                pass
            print(f'Waiting for sync... ({i+1}/30)')
        else:
            print('⚠️ No tools synced, continuing anyway...')

        # Fetch all tools, resources, and prompts
        # Note: Tools use gatewayId (camelCase), resources/prompts from catalog have no gatewayId
        tool_ids = []
        resource_ids = []
        prompt_ids = []

        try:
            tools = api_request('GET', '/tools')
            # Get tools from the fast_time gateway
            tool_ids = [t['id'] for t in tools if t.get('gatewayId') == gateway_id]
            print(f'Found tools: {[t[\"name\"] for t in tools if t.get(\"gatewayId\") == gateway_id]}')
        except Exception as e:
            print(f'Failed to fetch tools: {e}')

        try:
            resources = api_request('GET', '/resources')
            # Include all resources (from catalog)
            resource_ids = [r['id'] for r in resources]
            print(f'Found resources: {[r[\"name\"] for r in resources]}')
        except Exception as e:
            print(f'Failed to fetch resources: {e}')

        try:
            prompts = api_request('GET', '/prompts')
            # Include all prompts (from catalog)
            prompt_ids = [p['id'] for p in prompts]
            print(f'Found prompts: {[p[\"name\"] for p in prompts]}')
        except Exception as e:
            print(f'Failed to fetch prompts: {e}')

        # Create virtual server with all tools, resources, and prompts
        print('Creating virtual server...')
        try:
            # API expects payload wrapped in 'server' key
            # Use fixed UUID for consistent server ID across restarts
            server_payload = {
                'server': {
                    'id': '9779b6698cbd4b4995ee04a4fab38737',
                    'name': 'Fast Time Server',
                    'description': 'Virtual server exposing Fast Time MCP tools, resources, and prompts',
                    'associated_tools': tool_ids,
                    'associated_resources': resource_ids,
                    'associated_prompts': prompt_ids
                }
            }
            result = api_request('POST', '/servers', server_payload)
            print(f'Virtual server created: {result}')
            print(f'✅ Successfully created virtual server with {len(tool_ids)} tools, {len(resource_ids)} resources, {len(prompt_ids)} prompts')
        except Exception as e:
            print(f'❌ Failed to create virtual server: {e}')
            sys.exit(1)
        "

        # Write the bearer token to a file for load testing
        echo "Writing bearer token to /tmp/gateway-token.txt..."
        echo "$$MCPGATEWAY_BEARER_TOKEN" > /tmp/gateway-token.txt
        echo "Token written to /tmp/gateway-token.txt"

        echo "✅ Setup complete!"
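  # Illustrative follow-up: the one-shot job above writes the bearer token to
  # /tmp/gateway-token.txt inside its (exited) container. A token can also be
  # re-minted on the host (see the JWT example near the gateway settings above)
  # and used against the same registry endpoints the script calls, e.g.:
  #   curl -s -H "Authorization: Bearer $TOKEN" http://localhost:8080/tools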
###############################################################################
# Fast Test Server - Ultra-fast Rust MCP server for performance testing
# Provides: echo, get_system_time, get_stats tools via MCP Streamable HTTP
# Also exposes REST API endpoints for baseline comparison
# Usage: docker compose --profile testing up -d
###############################################################################
  fast_test_server:
    build:
      context: ./mcp-servers/rust/fast-test-server
      dockerfile: Containerfile
    image: mcpgateway/fast-test-server:latest
    restart: unless-stopped
    networks: [mcpnet]
    ports:
      - "8880:8880" # Port 8880 (avoids conflict with benchmark servers on 9000+)
    environment:
      - BIND_ADDRESS=0.0.0.0:8880
      - RUST_LOG=info
    # TCP kernel tuning for high-concurrency load testing
    sysctls:
      - net.ipv4.tcp_fin_timeout=15 # Faster cleanup of FIN_WAIT2 sockets
      - net.ipv4.ip_local_port_range=1024 65535 # More ephemeral ports
      - net.core.somaxconn=65535 # Max listen backlog
    ulimits:
      nofile:
        soft: 65535
        hard: 65535
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8880/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 128M
    profiles: ["testing", "monitoring"]

###############################################################################
# Auto-registration service - registers fast_test_server with gateway
###############################################################################
  register_fast_test:
    image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest}
    networks: [mcpnet]
    depends_on:
      gateway:
        condition: service_healthy
      fast_test_server:
        condition: service_healthy
    environment:
      - JWT_SECRET_KEY=my-test-key
    restart: "no"
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "Registering fast_test_server with gateway..."

        # Generate JWT token
        export MCPGATEWAY_BEARER_TOKEN=$$(python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256 2>/dev/null)

        # Register using Python
        python3 -c "
        import urllib.request
        import json
        import os
        import time

        token = os.environ.get('MCPGATEWAY_BEARER_TOKEN', '')

        def api_request(method, path, data=None):
            url = f'http://gateway:4444{path}'
            req = urllib.request.Request(url, method=method)
            req.add_header('Authorization', f'Bearer {token}')
            req.add_header('Content-Type', 'application/json')
            if data:
                req.data = json.dumps(data).encode('utf-8')
            with urllib.request.urlopen(req) as response:
                return json.loads(response.read().decode('utf-8'))

        # Delete existing gateway if present
        try:
            gateways = api_request('GET', '/gateways')
            for gw in gateways:
                if gw.get('name') == 'fast_test':
                    print(f'Deleting existing gateway {gw[\"id\"]}...')
                    api_request('DELETE', f'/gateways/{gw[\"id\"]}')
        except Exception as e:
            print(f'Note: {e}')

        # Register the gateway
        try:
            result = api_request('POST', '/gateways', {
                'name': 'fast_test',
                'url': 'http://fast_test_server:8880/mcp',
                'transport': 'STREAMABLEHTTP'
            })
            print(f'✅ Registered fast_test_server: {result.get(\"id\", \"unknown\")}')
        except Exception as e:
            print(f'❌ Registration failed: {e}')
            exit(1)
        "

        echo "✅ Registration complete!"
    profiles: ["testing", "monitoring"]
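  # Illustrative: the Rust test server exposes /health (used by its healthcheck above)
  # on the published port, handy as a REST baseline before going through the gateway:
  #   curl -sf http://localhost:8880/health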
profiles: ["testing", "monitoring"] ############################################################################### # Benchmark Server - Multi-server MCP benchmark tool # Spawns multiple lightweight MCP servers for load testing # Usage: make benchmark-up (or: docker compose --profile benchmark up -d) # # Environment variables: # BENCHMARK_SERVER_COUNT - Number of servers to spawn (default: 10) # BENCHMARK_START_PORT - Starting port number (default: 9000) ############################################################################### benchmark_server: build: context: ./mcp-servers/go/benchmark-server dockerfile: Dockerfile image: mcpgateway/benchmark-server:latest restart: unless-stopped networks: [mcpnet] command: - "-transport=http" - "-server-count=${BENCHMARK_SERVER_COUNT:-10}" - "-start-port=${BENCHMARK_START_PORT:-9000}" - "-tools=50" - "-resources=20" - "-prompts=10" ports: # Port range supports up to 100 servers (9000-9099) # Actual servers spawned controlled by BENCHMARK_SERVER_COUNT - "9000-9099:9000-9099" # Note: No healthcheck - scratch-based Go image has no shell # Verify health via: curl http://localhost:9000/health deploy: resources: limits: cpus: '2' memory: 1G reservations: cpus: '0.5' memory: 256M profiles: ["benchmark"] ############################################################################### # Auto-registration service - registers benchmark servers with gateway # Uses BENCHMARK_SERVER_COUNT and BENCHMARK_START_PORT environment variables ############################################################################### register_benchmark: image: ${IMAGE_LOCAL:-mcpgateway/mcpgateway:latest} networks: [mcpnet] depends_on: gateway: condition: service_healthy benchmark_server: condition: service_started environment: - JWT_SECRET_KEY=my-test-key - BENCHMARK_SERVER_COUNT=${BENCHMARK_SERVER_COUNT:-10} - BENCHMARK_START_PORT=${BENCHMARK_START_PORT:-9000} restart: "no" entrypoint: ["/bin/sh", "-c"] command: - | echo "Registering benchmark servers with gateway..." # Wait for benchmark servers to start (no healthcheck available) echo "Waiting for benchmark servers to start..." 
sleep 5 # Generate JWT token export MCPGATEWAY_BEARER_TOKEN=$$(python3 -m mcpgateway.utils.create_jwt_token --username admin@example.com --exp 10080 --secret my-test-key --algo HS256 2>/dev/null) # Register benchmark servers using environment variables python3 -c " import urllib.request import json import os token = os.environ.get('MCPGATEWAY_BEARER_TOKEN', '') server_count = int(os.environ.get('BENCHMARK_SERVER_COUNT', '10')) start_port = int(os.environ.get('BENCHMARK_START_PORT', '9000')) headers = { 'Authorization': f'Bearer {token}', 'Content-Type': 'application/json' } def api_request(method, path, data=None): url = f'http://gateway:4444{path}' body = json.dumps(data).encode() if data else None req = urllib.request.Request(url, data=body, headers=headers, method=method) with urllib.request.urlopen(req, timeout=30) as resp: return json.loads(resp.read().decode()) # Register benchmark servers print(f'Registering {server_count} benchmark servers (ports {start_port}-{start_port + server_count - 1})...') registered = 0 for port in range(start_port, start_port + server_count): name = f'benchmark-{port}' try: result = api_request('POST', '/gateways', { 'name': name, 'url': f'http://benchmark_server:{port}/mcp', 'transport': 'STREAMABLEHTTP' }) print(f'✅ Registered {name}: {result.get(\"id\", \"unknown\")}') registered += 1 except urllib.error.HTTPError as e: if e.code == 409: print(f'⏭️ {name} already registered') registered += 1 else: print(f'❌ Failed to register {name}: HTTP {e.code}') except Exception as e: print(f'❌ Failed to register {name}: {e}') print(f'✅ Registration complete: {registered}/{server_count} benchmark servers') " profiles: ["benchmark"] ############################################################################### # TLS PROFILE - Zero-config HTTPS via Nginx (enabled with --profile tls) # Usage: make compose-tls (or: docker compose --profile tls up -d) # # Features: # - Auto-generates self-signed certificates on first run # - Supports custom certificates (CA-signed or your own) # - Supports passphrase-protected keys (auto-decrypted for nginx) # - HTTPS on port 8443, HTTP on port 8080 (both available) # - Compatible with other profiles: --profile tls --profile monitoring # # ═══════════════════════════════════════════════════════════════════════════ # Bringing Your Own Certificates # ═══════════════════════════════════════════════════════════════════════════ # # Option 1: Unencrypted Private Key (no passphrase) # ─────────────────────────────────────────────────────────────────────────── # mkdir -p certs # cp /path/to/your/certificate.pem certs/cert.pem # cp /path/to/your/private-key.pem certs/key.pem # make compose-tls # # Option 2: Passphrase-Protected Private Key # ─────────────────────────────────────────────────────────────────────────── # mkdir -p certs # cp /path/to/your/certificate.pem certs/cert.pem # cp /path/to/your/encrypted-key.pem certs/key-encrypted.pem # echo "KEY_FILE_PASSWORD=your-passphrase" >> .env # make compose-tls # # The cert_init service will automatically decrypt key-encrypted.pem to # key.pem for nginx (nginx doesn't support passphrase-protected keys). 
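#
# Sanity check (illustrative): after decryption, key.pem should match cert.pem.
# For the RSA keys used by this stack the two digests below must be identical:
#   openssl x509 -noout -modulus -in certs/cert.pem | openssl md5
#   openssl rsa  -noout -modulus -in certs/key.pem  | openssl md5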
#
# Option 3: Generate Self-Signed with Passphrase
# ───────────────────────────────────────────────────────────────────────────
#   make certs-passphrase                            # Generates cert + key-encrypted.pem
#   echo "KEY_FILE_PASSWORD=your-passphrase" >> .env
#   make compose-tls                                 # Auto-decrypts for nginx
###############################################################################

  # ──────────────────────────────────────────────────────────────────────
  # Certificate Initialization - Auto-generates self-signed certs if missing
  # Supports passphrase-protected keys via KEY_FILE_PASSWORD
  # ──────────────────────────────────────────────────────────────────────
  cert_init:
    image: alpine/openssl:latest
    volumes:
      - ./certs:/certs
    environment:
      - KEY_FILE_PASSWORD=${KEY_FILE_PASSWORD:-}
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        # Check if we have an encrypted key that needs decryption
        # ($$ defers expansion to the container shell, so the passphrase is read
        #  from the environment above instead of being baked into the rendered command)
        if [ -f /certs/key-encrypted.pem ] && [ -n "$${KEY_FILE_PASSWORD}" ]; then
          # Validate: encrypted key requires matching certificate
          if [ ! -f /certs/cert.pem ]; then
            echo "❌ Found key-encrypted.pem but cert.pem is missing"
            echo "   Please provide both files: cert.pem and key-encrypted.pem"
            exit 1
          fi

          echo "🔓 Decrypting passphrase-protected key for nginx..."
          if [ -f /certs/key.pem ]; then
            echo "⚠️ Overwriting existing key.pem with decrypted version"
          fi

          # Decrypt the key for nginx (nginx doesn't support passphrase-protected keys)
          # Using env: prefix to avoid exposing password in process listing
          openssl rsa -in /certs/key-encrypted.pem -out /certs/key.pem -passin env:KEY_FILE_PASSWORD
          if [ $? -eq 0 ]; then
            chmod 640 /certs/key.pem
            echo "✅ Successfully decrypted key-encrypted.pem to key.pem"
          else
            echo "❌ Failed to decrypt key-encrypted.pem - check KEY_FILE_PASSWORD"
            exit 1
          fi
        fi

        # Check if we already have unencrypted certs
        if [ -f /certs/cert.pem ] && [ -f /certs/key.pem ]; then
          echo "✅ Certificates found in ./certs - using existing"
          exit 0
        fi

        # Generate new self-signed certificate (without passphrase for nginx)
        echo "🔏 Generating self-signed TLS certificate..."
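          # (Illustrative) to cover additional hostnames, append to the SAN list
          # below, e.g. ",DNS:my-gateway.example.com"; -addext requires
          # OpenSSL 1.1.1+, which the alpine/openssl image ships.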
mkdir -p /certs openssl req -x509 -newkey rsa:4096 -sha256 -days 365 -nodes \ -keyout /certs/key.pem -out /certs/cert.pem \ -subj "/CN=localhost" \ -addext "subjectAltName=DNS:localhost,DNS:gateway,DNS:nginx,IP:127.0.0.1" chmod 644 /certs/cert.pem chmod 640 /certs/key.pem echo "✅ TLS certificate generated in ./certs" profiles: ["tls"] # ────────────────────────────────────────────────────────────────────── # Nginx TLS - HTTPS-enabled reverse proxy (overrides default nginx) # ────────────────────────────────────────────────────────────────────── nginx_tls: build: context: ./infra/nginx dockerfile: Dockerfile image: mcpgateway/nginx-cache:latest restart: unless-stopped ports: - "8080:80" # HTTP caching proxy (public-facing) - "8443:443" # HTTPS caching proxy (public-facing) networks: [mcpnet] environment: # Set to "true" to force all HTTP requests to redirect to HTTPS - NGINX_FORCE_HTTPS=${NGINX_FORCE_HTTPS:-false} depends_on: gateway: condition: service_healthy cert_init: condition: service_completed_successfully volumes: - nginx_cache:/var/cache/nginx # Persistent cache storage - ./infra/nginx/nginx-tls.conf:/etc/nginx/nginx.conf:ro # TLS-enabled config - ./certs:/app/certs:ro # Mount SSL certs # TCP kernel tuning for 3000 concurrent connections sysctls: - net.ipv4.tcp_fin_timeout=15 - net.ipv4.ip_local_port_range=1024 65535 ulimits: nofile: soft: 65535 hard: 65535 healthcheck: test: ["CMD", "curl", "-fk", "https://localhost/health"] interval: 30s timeout: 5s retries: 3 start_period: 10s deploy: resources: limits: cpus: '4' memory: 1G reservations: cpus: '2' memory: 512M profiles: ["tls"]
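
###############################################################################
# Post-start TLS checks (illustrative); ports and paths match the tls profile above:
#   openssl x509 -in certs/cert.pem -noout -text | grep -A1 "Subject Alternative Name"
#   openssl s_client -connect localhost:8443 -servername localhost </dev/null 2>/dev/null \
#     | openssl x509 -noout -subject -dates
###############################################################################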