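# Compose file for running Text Generation Inference (TGI) for inference, an optional
# TGI-based safety (guard) model, and a Llama Stack server in front of them.
# Models, ports, and GPU assignments are controlled by the environment variables
# referenced below, each with a default.
# Example (illustrative; assumes this file is the default compose.yaml in the working directory):
#   HF_TOKEN=<your-hf-token> docker compose up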
services:
  tgi-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
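    # TGI launcher flags: bfloat16 weights, usage telemetry disabled, no tensor-parallel
    # sharding, and the process capped at 75% of the visible GPU's memory.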
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_INFERENCE_PORT:-8080}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
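
  # Optional guard-model server. The ${TGI_SAFETY_MODEL:+safety} expansion is intended
  # to name this service tgi-safety only when TGI_SAFETY_MODEL is set.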
  tgi-${TGI_SAFETY_MODEL:+safety}:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
      --port ${TGI_SAFETY_PORT:-8081}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
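
  # Llama Stack server. It starts only after the TGI services report healthy and is
  # pointed at them via TGI_URL / SAFETY_TGI_URL, which the mounted run config is
  # expected to consume.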
  llamastack:
    depends_on:
      tgi-inference:
        condition: service_healthy
      tgi-${TGI_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-tgi:test-0.0.52rc3
    network_mode: ${NETWORK_MODE:-bridged}
    volumes:
      - ~/.llama:/root/.llama
      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: sleep to give the TGI servers time to come up before starting the Llama Stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s
    environment:
      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
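
# The named volumes below are not referenced by the services above, which bind-mount
# host paths ($HOME/.cache/huggingface and ~/.llama) instead.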
volumes:
  tgi-inference:
  tgi-safety:
  llamastack: