services:
  # ---------------------------------------------------------------------------
  # Gateway - MCP Server for LLM agent integration
  # ---------------------------------------------------------------------------
  gateway:
    build:
      context: .
      dockerfile: gateway/Dockerfile
    container_name: sme_gateway
    restart: unless-stopped
    ports:
      - "8765:8765"
    environment:
      - SME_GATEWAY_SECRET=${SME_GATEWAY_SECRET}
      - SME_ADMIN_PASSWORD=${SME_ADMIN_PASSWORD}
    depends_on:
      - sme-operator
    volumes:
      - ./data:/app/data
    networks:
      - backend-net

  # ---------------------------------------------------------------------------
  # autoheal: Monitors HEALTHCHECK status and restarts unhealthy containers.
  # Docker's own restart policy only fires on container exit/crash, NOT when a
  # container transitions to "unhealthy". autoheal bridges that gap.
  # ---------------------------------------------------------------------------
  autoheal:
    image: willfarrell/autoheal:latest
    container_name: autoheal
    restart: unless-stopped
    environment:
      - AUTOHEAL_CONTAINER_LABEL=all
    volumes:
      # NOTE(review): mounting the Docker socket grants root-equivalent host
      # access to this container; unavoidable for autoheal, but treat the
      # image as trusted infrastructure.
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - backend-net

  sme-operator:
    build:
      context: .
      dockerfile: Dockerfile.operator
    container_name: sme_operator
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      - SME_DATA_DIR=/app/data
      # Security — values must be set in a .env file (see .env.example)
      - SME_GATEWAY_SECRET=${SME_GATEWAY_SECRET}
      - SME_ADMIN_PASSWORD=${SME_ADMIN_PASSWORD}
      - SME_HSM_SECRET=${SME_HSM_SECRET}
      - SME_CORS_ORIGINS=${SME_CORS_ORIGINS:-http://localhost:80,http://localhost:5173}
    depends_on:
      postgres:
        condition: service_healthy
    volumes:
      - ./src/sme/vendor:/app/src/sme/vendor
      - ./data:/app/data
    networks:
      - frontend-net
      - backend-net

  sme-frontend:
    build:
      context: .
      dockerfile: Dockerfile.frontend
    container_name: sme_frontend
    restart: unless-stopped
    ports:
      - "80:80"
    depends_on:
      - sme-operator
    networks:
      - frontend-net

  gpu-exporter:
    image: nvcr.io/nvidia/k8s/dcgm-exporter:latest
    container_name: gpu_exporter
    restart: unless-stopped
    command: ["-k"]
    ports:
      - "9400:9400"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - monitoring-net

  postgres:
    image: postgres:15-alpine
    container_name: sme_postgres
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-sme_user}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-sme_password}
      POSTGRES_DB: ${POSTGRES_DB:-sme_nexus}
    ports:
      - "5432:5432"
    volumes:
      - sme_pg_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-sme_user} -d ${POSTGRES_DB:-sme_nexus}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    networks:
      - backend-net

  # ---------------------------------------------------------------------------
  # sme-backup: Automated PostgreSQL backup service
  # Runs daily backups at 2am and retains backups for 7 days
  # ---------------------------------------------------------------------------
  sme-backup:
    # NOTE(review): postgres:15-alpine does not ship Python, but the command
    # below invokes `python scripts/backup_postgres.py` — confirm this image
    # is extended with Python, or switch to a purpose-built backup image.
    image: postgres:15-alpine
    container_name: sme_backup
    restart: unless-stopped
    environment:
      POSTGRES_HOST: postgres
      # Quoted so the value stays a string (YAML would otherwise type it int).
      POSTGRES_PORT: "5432"
      POSTGRES_USER: ${POSTGRES_USER:-sme_user}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-sme_password}
      POSTGRES_DB: ${POSTGRES_DB:-sme_nexus}
      SME_BACKUP_DIR: /app/backups
      SME_BACKUP_RETENTION_DAYS: "7"
    volumes:
      - sme_backups:/app/backups
    # Literal block (|), NOT folded (>): folding would join the shell `#`
    # comment onto the same line as the commands after it, commenting out the
    # initial backup and the daily loop so they would never run.
    command: |
      sh -c "
      echo 'Backup container initialized' &&
      # Run backup immediately then daily at 2am
      python scripts/backup_postgres.py || true &&
      while true; do
        sleep 86400 && python scripts/backup_postgres.py --cleanup || true
      done
      "
    networks:
      - backend-net
    depends_on:
      - postgres

  # ---------------------------------------------------------------------------
  # Ollama - Local LLM runtime (Docker-based)
  # Accessible at ollama:11434 from other containers
  # Models stored in persistent volume
  # ---------------------------------------------------------------------------
  ollama:
    image: ollama/ollama:latest
    container_name: sme_ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    environment:
      - OLLAMA_HOST=0.0.0.0
    volumes:
      - ollama_models:/root/.ollama
    # Pull default model on first startup (comment out to skip)
    # To add more models: docker exec sme_ollama ollama pull <model>
    # The server must be running before `ollama pull` (pull is a client
    # command that talks to the API), so serve first, pull, then wait.
    command: >
      sh -c "ollama serve & sleep 5; ollama pull llama3.2 || true; wait"
    deploy:
      resources:
        limits:
          # Allocate sufficient memory for larger models
          memory: 8G
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - backend-net
    healthcheck:
      # `ollama list` hits the local API and is always present in the image;
      # the ollama/ollama image does not ship curl, so a curl-based check
      # would report unhealthy forever (and autoheal would restart-loop it).
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s

volumes:
  sme_pg_data:
  ollama_models:
  sme_backups:

# ---------------------------------------------------------------------------
# Named networks for service isolation.
# frontend-net : frontend ↔ operator only
# backend-net : gateway ↔ operator ↔ postgres ↔ backup ↔ ollama ↔ autoheal
# monitoring-net: gpu-exporter (isolated from application traffic)
# ---------------------------------------------------------------------------
networks:
  frontend-net:
  backend-net:
  monitoring-net: