# This is a RayCluster configuration for exploration of the 100Gi Ray AIR XGBoostTrainer benchmark.

# The configuration includes 1 Ray head pod and 9 Ray worker pods.
# Each Ray container requests 54 Gi memory and 14 CPU.

# For underlying Kubernetes node configuration, we suggest a node group or pool with
# the following features:
# - 10 virtual machines
# - 64 Gi memory and 16 CPU each
#   (AWS: m5.4xlarge, GCP: e2-standard-16, Azure: Standard_D5_v2)
# - Each node should be configured with 1000 Gi of disk space (for data set storage).

# One Ray pod will be scheduled per Kubernetes node.
# The suggested gap between the Ray container resource requests and the K8s node's totals accounts
# for K8s control processes and cloud-provider-specific daemons.
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.

# A note on autoscaling:
# If you wish to observe Ray autoscaling in this example, make the following modification
# to your Kubernetes configuration:
# - Configure your Kubernetes node group or pool to autoscale with min 1, max 10 nodes.
# Make the following changes to this configuration file:
# 1. Uncomment the line `enableInTreeAutoscaling: true` in this configuration.
# 2. Under `workerGroupSpecs`, set `replicas: 0` and `minReplicas: 0`.
# Alternatively, use the configuration xgboost-benchmark-autoscaler.yaml in this directory,
# which already includes the above modifications.
# * The Ray cluster will then start with 0 Ray worker pods. The Ray autoscaler will automatically
#   scale up to 9 worker pods to accommodate the XGBoost-on-Ray workload.
# * The underlying Kubernetes cluster will start with 1 node. The Kubernetes cluster autoscaler will
#   scale up to 9 nodes to accommodate the Ray pods.

# Shortly after the job is complete, the Ray worker pods and corresponding Kubernetes nodes will
# be scaled down.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  name: raycluster-xgboost-benchmark
spec:
  # Uncomment the next line to experiment with autoscaling.
  # enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    # Kubernetes Service Type; valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'.
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    template:
      spec:
        containers:
        # The Ray head container
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
              # The node that hosts this pod should have at least 1000Gi of disk space,
              # for data set storage.
              ephemeral-storage: "700Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
              ephemeral-storage: "700Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  - replicas: 9
    minReplicas: 9
    maxReplicas: 9
    # To experiment with autoscaling,
    # set replicas and minReplicas to 0.
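    # With `enableInTreeAutoscaling: true` set above, the Ray autoscaler scales
    # this worker group between `minReplicas` and `maxReplicas`; the commented
    # values below let the cluster start with 0 worker pods, as described in the
    # header note: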
    # replicas: 0
    # minReplicas: 0
    groupName: large-group
    # The following params are used to complete the ray start command: ray start --block
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        # The container name must consist of lower case alphanumeric characters or '-',
        # and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc').
        - name: machine-learning
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
              # The node that hosts this pod should have at least 1000Gi of disk space,
              # for data set storage.
              ephemeral-storage: "700Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
              ephemeral-storage: "700Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # The env var $RAY_IP is set by the operator, with the value of the head service name.
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
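
# A minimal usage sketch, assuming the KubeRay operator is already running in your
# Kubernetes cluster and that this file is saved as xgboost-benchmark.yaml (the
# filename is illustrative):
#
#   kubectl apply -f xgboost-benchmark.yaml
#   kubectl get pods
#
# Once the underlying nodes are provisioned, you should see 1 head pod and
# 9 worker pods in the Running state.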