services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridge}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, we create a second vLLM service for the safety model;
  # otherwise the service name ends in a bare hyphen, which docker compose ignores.
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridge}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      vllm-inference:
        condition: service_healthy
      vllm-${VLLM_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    network_mode: ${NETWORK_MODE:-bridge}
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for the vLLM servers to come up before launching the Llama Stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port ${LLAMA_STACK_PORT:-8321}"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

volumes:
  vllm-inference:
  vllm-safety:
  llamastack:
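
# A minimal usage sketch (assumptions: this file is saved as compose.yaml in the current
# directory, HF_TOKEN holds a Hugging Face token with access to the models above, and the
# host has the NVIDIA Container Toolkit installed):
#
#   export HF_TOKEN=<your Hugging Face token>
#   # optional: also launch the safety model on a second GPU
#   export VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
#   docker compose up -d
#
# Once the vLLM healthchecks pass, the Llama Stack server listens on port 8321
# (override with LLAMA_STACK_PORT).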