Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-27 18:50:41 +00:00)
# What does this PR do?

Automatically generates:
- build.yaml
- run.yaml
- run-with-safety.yaml
- parts of the markdown docs for the distributions

## Test Plan

At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been performed, but much more testing is still needed.
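For context, the YAML reproduced below is the TGI distribution's Docker Compose file. It leans heavily on shell-style parameter expansion: `${VAR:-default}` supplies a fallback value, and `${VAR:+suffix}` expands to `suffix` only when `VAR` is set, which is how the optional safety (Llama Guard) service and the choice between `run.yaml` and `run-with-safety.yaml` are toggled. A quick sketch of the two forms as a POSIX shell evaluates them (Compose's interpolation is assumed here to behave the same way):

```bash
# ${VAR:-default} -> fallback value when VAR is unset or empty
unset TGI_INFERENCE_PORT
echo "${TGI_INFERENCE_PORT:-8080}"                  # prints: 8080

# ${VAR:+word} -> "word" only when VAR is set and non-empty
unset TGI_SAFETY_MODEL
echo "tgi-${TGI_SAFETY_MODEL:+safety}"              # prints: tgi-
echo "run${TGI_SAFETY_MODEL:+-with-safety}.yaml"    # prints: run.yaml

export TGI_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
echo "tgi-${TGI_SAFETY_MODEL:+safety}"              # prints: tgi-safety
echo "run${TGI_SAFETY_MODEL:+-with-safety}.yaml"    # prints: run-with-safety.yaml
```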
```yaml
services:
  tgi-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_INFERENCE_PORT:-8080}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  tgi-${TGI_SAFETY_MODEL:+safety}:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
      --port ${TGI_SAFETY_PORT:-8081}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      tgi-inference:
        condition: service_healthy
      tgi-${TGI_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-tgi:test-0.0.52rc3
    network_mode: ${NETWORK_MODE:-bridged}
    volumes:
      - ~/.llama:/root/.llama
      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s
    environment:
      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}

volumes:
  tgi-inference:
  tgi-safety:
  llamastack:
```
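A minimal sketch of how this stack might be brought up, assuming the file above is saved as `compose.yaml` next to the generated `run.yaml` / `run-with-safety.yaml`. The variable names are the ones the compose file already reads, and the model IDs are simply its defaults; the token value is a placeholder:

```bash
# Required by TGI to download gated Llama weights
export HF_TOKEN=hf_...   # placeholder; use your own Hugging Face token

# Optional overrides; the compose file's defaults apply otherwise
export TGI_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
# When set, run-with-safety.yaml is mounted and the second TGI service is named tgi-safety
export TGI_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export LLAMA_STACK_PORT=5001

docker compose up
```

Once the TGI health checks pass, the llamastack container starts and exposes the stack server on `LLAMA_STACK_PORT` (5001 by default).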