# What does this PR do?

Automatically generates:

- build.yaml
- run.yaml
- run-with-safety.yaml
- parts of the markdown docs for the distributions

## Test Plan

At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been done, but much more testing is still needed.
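For manual testing against the Docker Compose file shown below, a bring-up might look like the following minimal sketch. The token is a placeholder and the exported variables simply override defaults already defined in the compose file; none of this is prescribed by the PR itself.

```bash
# Start the remote-vllm distribution for a quick manual test.
# HF_TOKEN must be a real Hugging Face token; the value below is a placeholder.
export HF_TOKEN=hf_xxx

# Optional overrides; the compose file already uses these as defaults.
export VLLM_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export LLAMASTACK_PORT=5001

# Run from the directory containing the compose file.
docker compose up -d

# Once the vLLM healthcheck reports healthy, the Llama Stack server
# listens on http://localhost:5001 (the LLAMASTACK_PORT default).
```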
```yaml
services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      vllm-inference:
        condition: service_healthy
      vllm-${VLLM_SAFETY_MODEL:+safety}:
        condition: service_healthy
    # image: llamastack/distribution-remote-vllm
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    network_mode: ${NETWORK_MODE:-bridged}
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
    # Hack: wait for the vLLM server to start before launching the Llama Stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

volumes:
  vllm-inference:
  vllm-safety:
  llamastack:
```
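As the comment in the compose file explains, the `${VLLM_SAFETY_MODEL:+safety}` expansion is what makes the safety service optional: when the variable is unset, the service name ends in a trailing hyphen and gets ignored by docker compose, and plain run.yaml (rather than run-with-safety.yaml) is mounted into the llamastack container. A hedged sketch of enabling it, where the model name simply mirrors the SAFETY_MODEL default above:

```bash
# Enable the optional vllm-safety service before bringing the stack up.
# The value mirrors the SAFETY_MODEL default in the compose file; any
# vLLM-servable safety model should work.
export VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export SAFETY_MODEL=$VLLM_SAFETY_MODEL

# With VLLM_SAFETY_MODEL set, the vllm-safety service is created and
# run-with-safety.yaml is mounted as the Llama Stack run config.
docker compose up -d
```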