# Slurm cluster in Docker Compose: MariaDB accounting DB -> slurmdbd ->
# slurmctld -> slurmrestd + CPU/GPU workers, with optional Open OnDemand
# ("ondemand" profile) and Elasticsearch/Kibana ("monitoring" profile).
services:
  # Accounting database backing slurmdbd.
  mysql:
    image: mariadb:12
    hostname: mysql
    container_name: mysql
    environment:
      MYSQL_RANDOM_ROOT_PASSWORD: "yes"
      MYSQL_DATABASE: ${MYSQL_DATABASE:-slurm_acct_db}
      MYSQL_USER: ${MYSQL_USER:-slurm}
      MYSQL_PASSWORD: ${MYSQL_PASSWORD:-password}
    volumes:
      - var_lib_mysql:/var/lib/mysql
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

  # Slurm accounting daemon; the only service that builds the cluster image.
  slurmdbd:
    image: slurm-docker-cluster:${SLURM_VERSION:-25.11.4}
    build:
      context: .
      args:
        SLURM_VERSION: ${SLURM_VERSION:-25.11.4}
        LMOD_VERSION: ${LMOD_VERSION:-9.1.2}
        SPACK_VERSION: ${SPACK_VERSION:-v1.1.1}
        GPU_ENABLE: ${GPU_ENABLE:-false}
        BUILDER_BASE: ${BUILDER_BASE:-rockylinux/rockylinux:9}
        RUNTIME_BASE: ${RUNTIME_BASE:-rockylinux/rockylinux:9}
      cache_from:
        - slurm-docker-cluster:${SLURM_VERSION:-25.11.4}
    command: ["slurmdbd"]
    container_name: slurmdbd
    hostname: slurmdbd
    environment:
      MYSQL_USER: ${MYSQL_USER:-slurm}
      MYSQL_PASSWORD: ${MYSQL_PASSWORD:-password}
    volumes:
      - etc_munge:/etc/munge
      - etc_slurm:/etc/slurm
      - var_log_slurm:/var/log/slurm
    expose:
      - "6819"
    depends_on:
      mysql:
        condition: service_healthy
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD-SHELL", "pidof slurmdbd"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 20s

  # Slurm controller; optionally exposes SSH when SSH_ENABLE=true.
  slurmctld:
    image: slurm-docker-cluster:${SLURM_VERSION:-25.11.4}
    command: ["slurmctld"]
    container_name: slurmctld
    hostname: slurmctld
    privileged: true
    working_dir: /data
    environment:
      ELASTICSEARCH_HOST: ${ELASTICSEARCH_HOST:-}
      SSH_ENABLE: ${SSH_ENABLE:-false}
    volumes:
      - etc_munge:/etc/munge:z
      - etc_slurm:/etc/slurm:z
      - slurm_jobdir:/data:z
      - var_log_slurm:/var/log/slurm:z
      - opt_modulefiles:/opt/modulefiles
      - spack_root:/opt/spack
      - ${SSH_AUTHORIZED_KEYS:-/dev/null}:/tmp/authorized_keys_host:ro,z
    ports:
      - "${SSH_PORT:-3022}:22"
    expose:
      - "6817"
    depends_on:
      slurmdbd:
        condition: service_healthy
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD-SHELL", "scontrol ping"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 20s

  # Slurm REST API daemon.
  slurmrestd:
    image: slurm-docker-cluster:${SLURM_VERSION:-25.11.4}
    command: ["slurmrestd"]
    container_name: slurmrestd
    hostname: slurmrestd
    privileged: true
    volumes:
      - etc_munge:/etc/munge
      - etc_slurm:/etc/slurm
      - var_log_slurm:/var/log/slurm
    ports:
      - "6820:6820"
    expose:
      - "6820"
    depends_on:
      slurmctld:
        condition: service_healthy
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD-SHELL", "test -S /var/run/slurmrestd/slurmrestd.socket"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 20s

  # CPU compute nodes (scaled via deploy.replicas; no container_name so
  # replicas can coexist).
  cpu-worker:
    image: slurm-docker-cluster:${SLURM_VERSION:-25.11.4}
    command: ["slurmd-cpu"]
    working_dir: /data
    privileged: true
    environment:
      COMPOSE_PROJECT_NAME: ${COMPOSE_PROJECT_NAME:-slurm}
    volumes:
      - etc_munge:/etc/munge
      - etc_slurm:/etc/slurm
      - slurm_jobdir:/data
      - var_log_slurm:/var/log/slurm
      - home_ood:/home/ood  # shared with OOD for job I/O
      - opt_modulefiles:/opt/modulefiles
      - spack_root:/opt/spack
    expose:
      - "6818"
    depends_on:
      slurmctld:
        condition: service_healthy
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD-SHELL", "pidof slurmd"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 20s
    deploy:
      replicas: ${CPU_WORKER_COUNT:-2}

  # GPU compute node (NVIDIA)
  # Requires: GPU_ENABLE=true in .env and nvidia-container-toolkit installed on host
  gpu-worker:
    image: slurm-docker-cluster:${SLURM_VERSION:-25.11.4}
    command: ["slurmd-gpu"]
    working_dir: /data
    privileged: true
    profiles: ["gpu"]
    environment:
      COMPOSE_PROJECT_NAME: ${COMPOSE_PROJECT_NAME:-slurm}
    volumes:
      - etc_munge:/etc/munge
      - etc_slurm:/etc/slurm
      - slurm_jobdir:/data
      - var_log_slurm:/var/log/slurm
      - home_ood:/home/ood  # shared with OOD for job I/O
      - opt_modulefiles:/opt/modulefiles
      - spack_root:/opt/spack
    expose:
      - "6818"
    depends_on:
      slurmctld:
        condition: service_healthy
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD-SHELL", "pidof slurmd"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 20s
    deploy:
      replicas: ${GPU_WORKER_COUNT:-1}
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu, compute, utility]

  # Open OnDemand web portal (OPTIONAL)
  # Start with: docker compose --profile ondemand up -d
  # or set OOD_ENABLE=true in .env
  ondemand:
    build:
      context: .
      dockerfile: Dockerfile.ondemand
      args:
        SLURM_VERSION: ${SLURM_VERSION:-25.11.4}
        RUNTIME_BASE: ${RUNTIME_BASE:-rockylinux/rockylinux:9}
    profiles: ["ondemand"]
    container_name: ondemand
    hostname: ondemand
    volumes:
      - etc_munge:/etc/munge
      - etc_slurm:/etc/slurm
      - slurm_jobdir:/data
      - home_ood:/home/ood
    ports:
      - "${OOD_PORT:-8080}:8080"
    depends_on:
      slurmctld:
        condition: service_healthy
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/ || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # Optional Elasticsearch for job completion logging
  # Start with: docker compose --profile monitoring up -d
  # or with: make up-with-monitoring
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:9.3.0
    profiles: ["monitoring"]
    container_name: elasticsearch
    hostname: elasticsearch
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data
    expose:
      - "9200"
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9200/_cluster/health"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # Kibana for visualizing Elasticsearch job completions
  kibana:
    image: docker.elastic.co/kibana/kibana:9.3.0
    profiles: ["monitoring"]
    container_name: kibana
    hostname: kibana
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    ports:
      - "5601:5601"
    depends_on:
      elasticsearch:
        condition: service_healthy
    networks:
      - slurm-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5601/api/status"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 60s

# Named volumes; bare keys take the default local driver.
volumes:
  etc_munge:
  etc_slurm:
  slurm_jobdir:
  var_lib_mysql:
  var_log_slurm:
  home_ood:
  opt_modulefiles:
  spack_root:
  elasticsearch_data:

networks:
  slurm-network:
    driver: bridge