services:
  ${SERVICE_NAME:-tgi}:
    image: ghcr.io/huggingface/text-generation-inference:2.3.1
    network_mode: "host"
    volumes:
      # Reuse the host's Hugging Face cache so downloaded weights persist across runs.
      - $HOME/.cache/huggingface:/data
    ports:
      # Informational only: with network_mode "host" the container already shares
      # the host network, so this mapping is not applied.
      - "${TGI_PORT:-8000}:${TGI_PORT:-8000}"
    devices:
      # CDI-style GPU request (used by Podman and recent Docker releases).
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      # Point every Hugging Face cache location at the mounted volume.
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_PORT:-8000}
      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
    # GPU access is requested both via the deploy spec and the legacy nvidia
    # runtime so the file works across Docker versions.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      # The healthcheck runs inside the container; under host networking the
      # compose service name does not resolve, so probe localhost instead.
      test: ["CMD", "curl", "-f", "http://localhost:${TGI_PORT:-8000}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
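
# Usage sketch, assuming the defaults above (service "tgi", port 8000; each can
# be overridden via the corresponding environment variable):
#
#   docker compose up -d
#
# Once the healthcheck passes, the server answers TGI's /generate API, e.g.:
#
#   curl http://localhost:8000/generate \
#     -H 'Content-Type: application/json' \
#     -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 64}}'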