services:
  # TGI server for the inference model.
  tgi-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridge}
    ports:
      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_INFERENCE_PORT:-8080}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # TGI server for the safety (Llama Guard) model; the service name expands to
  # tgi-safety when TGI_SAFETY_MODEL is set.
  tgi-${TGI_SAFETY_MODEL:+safety}:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridge}
    ports:
      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
      --port ${TGI_SAFETY_PORT:-8081}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # Llama Stack server wired to both TGI endpoints.
  llamastack:
    depends_on:
      tgi-inference:
        condition: service_healthy
      tgi-${TGI_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-tgi:test-0.0.52rc3
    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.llama:/root/.llama
      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for the TGI servers to start before launching the Llama Stack server
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
    environment:
      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}

volumes:
  tgi-inference:
  tgi-safety:
  llamastack:
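
# Usage sketch (assumptions: HF_TOKEN is exported with a valid Hugging Face token,
# the NVIDIA container toolkit is installed, and the GPUs referenced by
# TGI_INFERENCE_GPU / TGI_SAFETY_GPU exist on the host):
#
#   export HF_TOKEN=hf_...                                 # hypothetical token value
#   export TGI_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B    # optional: also start the safety server
#   docker compose up --detach
#
# The Llama Stack server is then reachable on the host at localhost:${LLAMA_STACK_PORT:-8321}.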