# llama-stack-mirror/llama_stack/providers/remote/inference/tgi/docker_compose.yaml
# 2024-11-16 21:48:34 -08:00
#
# 35 lines
# 1,006 B
# YAML

# Docker Compose definition for one Hugging Face Text Generation Inference
# (TGI) container.
#
# Variables read via interpolation (value after `:-` is the default):
#   SERVICE_NAME              name of the service key       (tgi)
#   TGI_PORT                  value passed to --port        (8000)
#   TGI_MODEL                 value passed to --model-id
#                             (meta-llama/Llama-3.2-3B-Instruct)
#   TGI_CUDA_MEMORY_FRACTION  value of --cuda-memory-fraction (0.8)
#   CUDA_VISIBLE_DEVICES      forwarded into the container  (0)
#   HOME                      root of the host-side Hugging Face cache
services:
  # NOTE(review): Compose documents interpolation for values; confirm that
  # whatever consumes this file also expands the variable in this key.
  ${SERVICE_NAME:-tgi}:
    image: ghcr.io/huggingface/text-generation-inference:2.3.1

    # Host networking: the server is reachable directly on the host at the
    # port given to --port below. No `ports:` mapping is declared because
    # the Compose specification forbids combining port mappings with
    # `network_mode: host`.
    network_mode: host

    # Host Hugging Face cache mounted at /data; every HF_* cache variable
    # below points at that same mount.
    volumes:
      - "$HOME/.cache/huggingface:/data"

    # GPU access is requested three ways: this device entry, the
    # `deploy.resources` reservation, and `runtime: nvidia`.
    # NOTE(review): presumably kept for compatibility across container
    # runtimes -- confirm which of these are still required.
    devices:
      - nvidia.com/gpu=all

    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data

    # Folded scalar: the lines are joined with single spaces into one
    # argument string; `-` drops the trailing newline.
    command: >-
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_PORT:-8000}
      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}

    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]

    runtime: nvidia

    # The probe runs inside the container, so it targets localhost: with
    # host networking the service name is not resolvable through Compose's
    # network DNS. Up to 30 attempts, 5s apart, each limited to 5s.
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:${TGI_PORT:-8000}/health"
      interval: 5s
      timeout: 5s
      retries: 30