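# Docker Compose setup for the Llama Stack "remote-vllm" distribution: a vLLM server
# for inference, an optional second vLLM server for the safety model (created only
# when VLLM_SAFETY_MODEL is set), and the Llama Stack server wired to both.
# Ports, GPU indices, and models are configurable via the environment variables below.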
services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
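    # CUDA_VISIBLE_DEVICES pins this server to a single GPU (index set via
    # VLLM_INFERENCE_GPU); HF_TOKEN is needed to pull gated models from Hugging Face.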
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
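    # Arguments passed to the vLLM OpenAI-compatible server: cap GPU memory usage,
    # select the model, skip CUDA graph capture (--enforce-eager), and bound the
    # context length and number of concurrent sequences.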
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

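  # Llama Stack server: waits for the vLLM service(s) to report healthy, then starts
  # the server with the run config mounted below.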
  llamastack:
    depends_on:
      - vllm-inference:
          condition: service_healthy
      - vllm-${VLLM_SAFETY_MODEL:+safety}:
          condition: service_healthy
    # image: llamastack/distribution-remote-vllm
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    network_mode: ${NETWORK_MODE:-bridged}
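    # Endpoints and model defaults consumed by the mounted run config
    # (llamastack-run-remote-vllm.yaml); the vLLM URLs point at the services above.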
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
    # Hack: wait for vLLM server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

volumes:
  vllm-inference:
  vllm-safety:
  llamastack:
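# Example invocation (illustrative; adjust variables to your setup):
#   export HF_TOKEN=<your Hugging Face token>
#   VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B docker compose up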