services:
  ${SERVICE_NAME:-tgi}:
    image: ghcr.io/huggingface/text-generation-inference:2.3.1
    network_mode: "host"
    volumes:
      # Reuse the host's Hugging Face cache so downloaded weights persist across runs.
      - $HOME/.cache/huggingface:/data
    ports:
      # Informational only: with network_mode "host" the container already shares
      # the host network, so this mapping is not applied.
      - "${TGI_PORT:-8000}:${TGI_PORT:-8000}"
    devices:
      # CDI-style GPU request (used by Podman and recent Docker releases).
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      # Point every Hugging Face cache location at the mounted volume.
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_PORT:-8000}
      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
    # GPU access is requested both via the deploy spec and the legacy nvidia
    # runtime so the file works across Docker versions.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      # The healthcheck runs inside the container; under host networking the
      # compose service name does not resolve, so probe localhost instead.
      test: ["CMD", "curl", "-f", "http://localhost:${TGI_PORT:-8000}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
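
# Usage sketch, assuming the defaults above (service "tgi", port 8000; each can
# be overridden via the corresponding environment variable):
#
#   docker compose up -d
#
# Once the healthcheck passes, the server answers TGI's /generate API, e.g.:
#
#   curl http://localhost:8000/generate \
#     -H 'Content-Type: application/json' \
#     -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 64}}'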