# llama-stack/distributions/remote-vllm/compose.yaml
# NOTES:
#
# This Docker Compose (and the associated run.yaml) assumes you will be
# running in the default "bridged" network mode.
#
# If you need "host" network mode, please uncomment
# - network_mode: "host"
#
# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
#
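# For example, the vLLM inference URL in run.yaml would change roughly like this
# (illustrative; the exact key layout in your run.yaml may differ):
#
#   url: http://host.docker.internal:5100/v1   ->   url: http://localhost:5100/v1
#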
services:
  vllm-0:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    # network_mode: "host"
    ports:
      - "5100:5100"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model meta-llama/Llama-3.1-8B-Instruct
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port 5100
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
  vllm-1:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    # network_mode: "host"
    ports:
      - "5101:5101"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=1
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model meta-llama/Llama-Guard-3-1B
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port 5101
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      - vllm-0
      - vllm-1
    # image: llamastack/distribution-remote-vllm
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
    # network_mode: "host"
    ports:
      - "5001:5001"
    # Hack: sleep to give the vLLM servers time to start before launching the Llama Stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
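    # A more robust alternative to the fixed sleep would be healthcheck-gated startup
    # (a sketch, assuming the vLLM OpenAI server's /health endpoint and that curl is
    # available inside the vllm/vllm-openai image):
    #
    #   vllm-0:
    #     healthcheck:
    #       test: ["CMD", "curl", "-f", "http://localhost:5100/health"]
    #       interval: 10s
    #       retries: 30
    #   llamastack:
    #     depends_on:
    #       vllm-0:
    #         condition: service_healthy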
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  vllm-0:
  vllm-1:
  llamastack: