# This sample RayService shows how to configure Ray Serve LLM for
# high-throughput serving.
# High-throughput serving is enabled by setting the following env vars on both
# the head and worker containers:
#
# - RAY_SERVE_ENABLE_HA_PROXY=1
# - RAY_SERVE_THROUGHPUT_OPTIMIZED=1
# - RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING=1
# - VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1
#
# This example serves Gemma 4 E2B on NVIDIA B200 GPUs, as benchmarked on GKE.
---
# KubeRay custom resource of kind RayService, named ray-gemma4-e2b-it.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: ray-gemma4-e2b-it
# The spec holds the Ray Serve application config (serveConfigV2) and the Ray
# cluster definition (rayClusterConfig).
spec:
  # Ray Serve config, embedded as one literal block scalar (|): everything
  # indented beneath this key is a single multi-line string and is not parsed
  # as part of this manifest.
  # It declares one application, llm_app, imported from
  # ray.serve.llm:build_openai_app and routed at "/", with a single LLM config.
  serveConfigV2: |
    applications:
    - name: llm_app
      import_path: ray.serve.llm:build_openai_app
      route_prefix: "/"
      args:
        llm_configs:
          - model_loading_config:
              model_id: gemma4-e2b-it
              model_source: google/gemma-4-E2B-it
            accelerator_type: B200
            engine_kwargs:
              enable_auto_tool_choice: true
              # You may want to set a larger max model len to support caching
              # longer context windows if needed.
              max_model_len: 4096
              reasoning_parser: gemma4
              tensor_parallel_size: 1
              tool_call_parser: gemma4
            deployment_config:
              num_replicas: 8  # Should match the number of GPUs available.
  rayClusterConfig:
    # Quoted so the version is always read as a string. It matches the 2.56.0
    # in the ray-llm image tag used by the head container.
    rayVersion: '2.56.0'
    # Head pod: requests CPU, memory and ephemeral storage only (no GPU).
    headGroupSpec:
      rayStartParams:
        # NOTE(review): presumably keeps Ray from scheduling work on the head
        # node -- confirm against the Ray/KubeRay docs.
        num-cpus: '0'
        # Bind the dashboard on all interfaces.
        dashboard-host: '0.0.0.0'
      template:
        spec:
          containers:
          - name: ray-node
            image: rayproject/ray-llm:2.56.0-py312-cu130
            env:
            # The four high-throughput switches listed in the file header; the
            # worker container sets the same values.
            - name: RAY_SERVE_ENABLE_HA_PROXY
              value: '1'
            - name: RAY_SERVE_THROUGHPUT_OPTIMIZED
              value: '1'
            - name: RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING
              value: '1'
            - name: VLLM_USE_RAY_V2_EXECUTOR_BACKEND
              value: '1'
            # Hugging Face token, read from key hf_api_token of the Secret
            # named hf-secret.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  key: hf_api_token
                  name: hf-secret
            ports:
            - name: gcs-server
              containerPort: 6379
            - name: dashboard
              containerPort: 8265
            - name: client
              containerPort: 10001
            - name: serve
              containerPort: 8000
            resources:
              requests:
                cpu: '4'
                memory: 16Gi
                ephemeral-storage: 32Gi
    # GPU worker group: one pod requesting 8 GPUs. 8 GPUs x 1 replica matches
    # num_replicas: 8 (with tensor_parallel_size: 1) in serveConfigV2.
    workerGroupSpecs:
    - replicas: 1
      groupName: gpu-group
      rayStartParams: {}
      template:
        # Explicit empty mapping instead of a bare `metadata:` (which parses as
        # null); add pod labels/annotations here if needed.
        metadata: {}
        spec:
          containers:
          - name: llm
            image: rayproject/ray-llm:2.56.0-py312-cu130
            imagePullPolicy: Always
            env:
            # The four high-throughput switches listed in the file header; the
            # head container sets the same values.
            - name: RAY_SERVE_ENABLE_HA_PROXY
              value: '1'
            - name: RAY_SERVE_THROUGHPUT_OPTIMIZED
              value: '1'
            - name: RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING
              value: '1'
            - name: VLLM_USE_RAY_V2_EXECUTOR_BACKEND
              value: '1'
            # Hugging Face token, read from key hf_api_token of the Secret
            # named hf-secret.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
            resources:
              # The GPU count is set identically under limits and requests.
              limits:
                nvidia.com/gpu: '8'
              requests:
                nvidia.com/gpu: '8'
                cpu: '128'
                memory: 512Gi
                ephemeral-storage: 256Gi
            volumeMounts:
            # Memory-backed volume (see `volumes` below) mounted as /dev/shm.
            - mountPath: /dev/shm
              name: dshm
          # Schedule only on nodes labelled as carrying NVIDIA B200 GPUs.
          nodeSelector:
            cloud.google.com/gke-accelerator: nvidia-b200
          # Tolerate NoSchedule taints keyed nvidia.com/gpu, whatever the value.
          tolerations:
          - key: nvidia.com/gpu
            operator: Exists
            effect: NoSchedule
          volumes:
          # NOTE(review): no sizeLimit is set on this memory-backed emptyDir;
          # consider whether one is wanted.
          - name: dshm
            emptyDir:
              medium: Memory