# NOTES:
#
# This Docker Compose (and the associated run.yaml) assumes you will be
# running in the default "bridged" network mode.
#
# If you need "host" network mode, please uncomment
#   network_mode: "host"
#
# Similarly, change "host.docker.internal" to "localhost" in the run.yaml file
#
services:
  vllm-0:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    # network_mode: "host"
    ports:
      - "5100:5100"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model meta-llama/Llama-3.1-8B-Instruct
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port 5100
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
  vllm-1:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    # network_mode: "host"
    ports:
      - "5101:5101"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=1
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model meta-llama/Llama-Guard-3-1B
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port 5101
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      - vllm-0
      - vllm-1
    # image: llamastack/distribution-remote-vllm
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
    # network_mode: "host"
    ports:
      - "5001:5001"
    # Hack: wait for the vLLM servers to start before launching the Llama Stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  vllm-0:
  vllm-1:
  llamastack:
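# Usage sketch (assumes HF_TOKEN is exported in your shell, the NVIDIA
# container runtime is installed, and this file is named compose.yaml in the
# current directory; adjust paths and names as needed):
#
#   export HF_TOKEN=<your HuggingFace token>
#   docker compose up
#
# After the 60s startup wait above, the Llama Stack server listens on port
# 5001, with the two vLLM backends on ports 5100 and 5101.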