# This sample RayService shows how to configure Ray Serve LLM for
# high-throughput serving.
# This is enabled by the env vars
#
# - RAY_SERVE_ENABLE_HA_PROXY=1
# - RAY_SERVE_THROUGHPUT_OPTIMIZED=1
# - RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING=1
# - VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1
#
# ...which should be set on both the head and workers.
# This example uses L4s on GKE with Qwen 3.5 for illustrative purposes only.
# The high-throughput routing can be used with any model, accelerators,
# resources, and workloads.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: qwen-35-4b
spec:
  # Ray Serve application config, embedded as a literal block string.
  # Do not add YAML comments inside the block: they would become part of
  # the string value.
  serveConfigV2: |
    applications:
      - name: llm_app
        import_path: ray.serve.llm:build_openai_app
        args:
          llm_configs:
            - model_loading_config:
                model_id: qwen-3.5-4b
                model_source: Qwen/Qwen3.5-4b
              accelerator_type: L4
              deployment_config:
                num_replicas: 4
  rayClusterConfig:
    # Quoted so the version is always read as a string; it matches the
    # 2.56.0 tag of the container images below.
    rayVersion: '2.56.0'
    headGroupSpec:
      rayStartParams:
        dashboard-host: '0.0.0.0'
        # NOTE(review): presumably set to zero so that no Serve replicas
        # are scheduled on the head node - confirm.
        num-cpus: '0'
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray-llm:2.56.0-py312-cu130
              resources:
                limits:
                  memory: 8Gi
                  ephemeral-storage: 32Gi
                requests:
                  cpu: '2'
                  memory: 8Gi
                  ephemeral-storage: 32Gi
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              # The four high-throughput env vars are repeated verbatim on
              # the worker container; keep the two lists in sync.
              env:
                - name: RAY_SERVE_ENABLE_HA_PROXY
                  value: '1'
                - name: RAY_SERVE_THROUGHPUT_OPTIMIZED
                  value: '1'
                - name: RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING
                  value: '1'
                - name: VLLM_USE_RAY_V2_EXECUTOR_BACKEND
                  value: '1'
                # Read from key hf_api_token of the Secret named hf-secret.
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-secret
                      key: hf_api_token
    workerGroupSpecs:
      # Four workers with one GPU each, the same count as num_replicas in
      # serveConfigV2 above.
      - replicas: 4
        groupName: gpu-group
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: llm
                image: rayproject/ray-llm:2.56.0-py312-cu130
                # Same high-throughput env vars as on the head container.
                env:
                  - name: RAY_SERVE_ENABLE_HA_PROXY
                    value: '1'
                  - name: RAY_SERVE_THROUGHPUT_OPTIMIZED
                    value: '1'
                  - name: RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING
                    value: '1'
                  - name: VLLM_USE_RAY_V2_EXECUTOR_BACKEND
                    value: '1'
                  # Read from key hf_api_token of the Secret named hf-secret.
                  - name: HUGGING_FACE_HUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-secret
                        key: hf_api_token
                # NOTE(review): memory is requested but has no limit here,
                # unlike the head container - confirm this is intentional.
                resources:
                  limits:
                    nvidia.com/gpu: '1'
                    ephemeral-storage: 24Gi
                  requests:
                    cpu: '6'
                    memory: 24Gi
                    nvidia.com/gpu: '1'
                    ephemeral-storage: 24Gi
            # Schedule worker pods onto GKE nodes labelled with L4 GPUs.
            nodeSelector:
              cloud.google.com/gke-accelerator: nvidia-l4