name: vllm

x-hf-cache: &hf_cache
  - ${HF_CACHE_DIR:-${HOME}/.cache/hf-cache}:/root/.cache/huggingface

networks:
  appnet: {}

services:
  vllm:
    image: vllm/vllm-openai:latest
    # --max-model-len 4096 added after OOMs on a 1B model caused by KV-cache growth
    command: >
      --model ${VLLM_MODEL}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_UTIL}
      --max-model-len 4096
      --dtype auto
      --trust-remote-code
    ipc: host
    environment:
      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
    volumes: *hf_cache
    # Publish on loopback only so Tailscale Serve can proxy to it
    ports:
      - "127.0.0.1:8000:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["1"]
              # device_ids: ["0", "1"]  # dual GPU
              capabilities: [gpu]
    networks: [appnet]
    restart: unless-stopped
    # TODO(evan): enable the vLLM option that releases GPU memory while the server is idle
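# --- Usage notes (a sketch; values and commands below are assumptions, adjust to your setup) ---
#
# Example .env consumed by the variables above (placeholder values, not from this repo):
#   HF_TOKEN=hf_xxx
#   VLLM_MODEL=meta-llama/Llama-3.2-1B-Instruct
#   VLLM_GPU_UTIL=0.90
#   HF_CACHE_DIR=${HOME}/.cache/hf-cache
#
# Smoke test against the OpenAI-compatible endpoint once the container is up
# (the "model" field must match ${VLLM_MODEL}; <model-name> is a placeholder):
#   curl http://127.0.0.1:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "<model-name>", "prompt": "Hello", "max_tokens": 16}'
#
# Proxying the loopback port over the tailnet with Tailscale Serve
# (exact syntax varies by Tailscale version; this is the recent short form):
#   tailscale serve --bg 8000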