services:
  # ---------------------------------------------------------------------------
  # Gateway - MCP Server for LLM agent integration
  # ---------------------------------------------------------------------------
  gateway:
    build:
      context: .
      dockerfile: gateway/Dockerfile
    container_name: sme_gateway
    restart: unless-stopped
    ports:
      - "8765:8765"
    environment:
      - SME_GATEWAY_SECRET=${SME_GATEWAY_SECRET}
      - SME_ADMIN_PASSWORD=${SME_ADMIN_PASSWORD}
    depends_on:
      - sme-operator
    volumes:
      - ./data:/app/data
    networks:
      - backend-net

  # ---------------------------------------------------------------------------
  # autoheal: Monitors HEALTHCHECK status and restarts unhealthy containers.
  # Docker's own restart policy only fires on container exit/crash, NOT when a
  # container transitions to "unhealthy". autoheal bridges that gap.
  # ---------------------------------------------------------------------------
  autoheal:
    image: willfarrell/autoheal:latest
    container_name: autoheal
    restart: unless-stopped
    environment:
      - AUTOHEAL_CONTAINER_LABEL=all
    volumes:
      # NOTE(review): mounting the Docker socket grants root-equivalent host
      # access to this container; unavoidable for autoheal, but treat the
      # image as trusted infrastructure.
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - backend-net

  sme-operator:
    build:
      context: .
      dockerfile: Dockerfile.operator
    container_name: sme_operator
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      - SME_DATA_DIR=/app/data
      # Security — values must be set in a .env file (see .env.example)
      - SME_GATEWAY_SECRET=${SME_GATEWAY_SECRET}
      - SME_ADMIN_PASSWORD=${SME_ADMIN_PASSWORD}
      - SME_HSM_SECRET=${SME_HSM_SECRET}
      - SME_CORS_ORIGINS=${SME_CORS_ORIGINS:-http://localhost:80,http://localhost:5173}
    depends_on:
      postgres:
        condition: service_healthy
    volumes:
      - ./src/sme/vendor:/app/src/sme/vendor
      - ./data:/app/data
    networks:
      - frontend-net
      - backend-net

  sme-frontend:
    build:
      context: .
      dockerfile: Dockerfile.frontend
    container_name: sme_frontend
    restart: unless-stopped
    ports:
      - "80:80"
    depends_on:
      - sme-operator
    networks:
      - frontend-net

  gpu-exporter:
    image: nvcr.io/nvidia/k8s/dcgm-exporter:latest
    container_name: gpu_exporter
    restart: unless-stopped
    command: ["-k"]
    ports:
      - "9400:9400"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - monitoring-net

  postgres:
    image: postgres:15-alpine
    container_name: sme_postgres
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-sme_user}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-sme_password}
      POSTGRES_DB: ${POSTGRES_DB:-sme_nexus}
    ports:
      - "5432:5432"
    volumes:
      - sme_pg_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-sme_user} -d ${POSTGRES_DB:-sme_nexus}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    networks:
      - backend-net

  # ---------------------------------------------------------------------------
  # sme-backup: Automated PostgreSQL backup service
  # Runs daily backups at 2am and retains backups for 7 days
  # ---------------------------------------------------------------------------
  sme-backup:
    # NOTE(review): postgres:15-alpine does not ship Python, but the command
    # below invokes `python scripts/backup_postgres.py` — confirm this image
    # is extended with Python, or switch to a purpose-built backup image.
    image: postgres:15-alpine
    container_name: sme_backup
    restart: unless-stopped
    environment:
      POSTGRES_HOST: postgres
      # Quoted so the value stays a string (YAML would otherwise type it int).
      POSTGRES_PORT: "5432"
      POSTGRES_USER: ${POSTGRES_USER:-sme_user}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-sme_password}
      POSTGRES_DB: ${POSTGRES_DB:-sme_nexus}
      SME_BACKUP_DIR: /app/backups
      SME_BACKUP_RETENTION_DAYS: "7"
    volumes:
      - sme_backups:/app/backups
    # Literal block (|), NOT folded (>): folding would join the shell `#`
    # comment onto the same line as the commands after it, commenting out the
    # initial backup and the daily loop so they would never run.
    command: |
      sh -c "
      echo 'Backup container initialized' &&
      # Run backup immediately then daily at 2am
      python scripts/backup_postgres.py || true &&
      while true; do
        sleep 86400 && python scripts/backup_postgres.py --cleanup || true
      done
      "
    networks:
      - backend-net
    depends_on:
      - postgres

  # ---------------------------------------------------------------------------
  # Ollama - Local LLM runtime (Docker-based)
  # Accessible at ollama:11434 from other containers
  # Models stored in persistent volume
  # ---------------------------------------------------------------------------
  ollama:
    image: ollama/ollama:latest
    container_name: sme_ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    environment:
      - OLLAMA_HOST=0.0.0.0
    volumes:
      - ollama_models:/root/.ollama
    # Pull default model on first startup (comment out to skip)
    # To add more models: docker exec sme_ollama ollama pull <model>
    # The server must be running before `ollama pull` (pull is a client
    # command that talks to the API), so serve first, pull, then wait.
    command: >
      sh -c "ollama serve & sleep 5; ollama pull llama3.2 || true; wait"
    deploy:
      resources:
        limits:
          # Allocate sufficient memory for larger models
          memory: 8G
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - backend-net
    healthcheck:
      # `ollama list` hits the local API and is always present in the image;
      # the ollama/ollama image does not ship curl, so a curl-based check
      # would report unhealthy forever (and autoheal would restart-loop it).
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s

volumes:
  sme_pg_data:
  ollama_models:
  sme_backups:

# ---------------------------------------------------------------------------
# Named networks for service isolation.
# frontend-net : frontend ↔ operator only
# backend-net : gateway ↔ operator ↔ postgres ↔ backup ↔ ollama ↔ autoheal
# monitoring-net: gpu-exporter (isolated from application traffic)
# ---------------------------------------------------------------------------
networks:
  frontend-net:
  backend-net:
  monitoring-net: