name: vllm

x-hf-cache: &hf_cache
  - ${HF_CACHE_DIR:-${HOME}/.cache/hf-cache}:/root/.cache/huggingface

networks:
  appnet: {}

services:
  vllm:
    image: vllm/vllm-openai:latest
    # --max-model-len 4096 added after OOMs on a 1B model caused by KV-cache growth
    command: >
      --model ${VLLM_MODEL}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_UTIL}
      --max-model-len 4096
      --dtype auto
      --trust-remote-code
    ipc: host
    environment:
      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
    volumes: *hf_cache
    # Publish on loopback only so Tailscale Serve can proxy to it
    ports:
      - "127.0.0.1:8000:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["1"]
              # device_ids: ["0", "1"]  # dual GPU
              capabilities: [gpu]
    networks: [appnet]
    restart: unless-stopped
    # TODO(evan): enable the vLLM option that releases GPU memory while the server is idle
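# --- Usage notes (a sketch; values and commands below are assumptions, adjust to your setup) ---
#
# Example .env consumed by the variables above (placeholder values, not from this repo):
#   HF_TOKEN=hf_xxx
#   VLLM_MODEL=meta-llama/Llama-3.2-1B-Instruct
#   VLLM_GPU_UTIL=0.90
#   HF_CACHE_DIR=${HOME}/.cache/hf-cache
#
# Smoke test against the OpenAI-compatible endpoint once the container is up
# (the "model" field must match ${VLLM_MODEL}; <model-name> is a placeholder):
#   curl http://127.0.0.1:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "<model-name>", "prompt": "Hello", "max_tokens": 16}'
#
# Proxying the loopback port over the tailnet with Tailscale Serve
# (exact syntax varies by Tailscale version; this is the recent short form):
#   tailscale serve --bg 8000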