# Mirror of https://github.com/meta-llama/llama-stack.git
# Synced 2025-12-16 21:12:38 +00:00
# 35 lines, 1,006 B, YAML
# Docker Compose definition for a single GPU-backed Hugging Face
# text-generation-inference (TGI) container on the host network.
#
# Variables substituted into this file (default in parentheses):
#   SERVICE_NAME              service key                     (tgi)
#   TGI_PORT                  port passed to --port           (8000)
#   TGI_MODEL                 model passed to --model-id
#                             (meta-llama/Llama-3.2-3B-Instruct)
#   TGI_CUDA_MEMORY_FRACTION  --cuda-memory-fraction value    (0.8)
#   CUDA_VISIBLE_DEVICES      forwarded into the container    (0)
#   HOME                      locates the host Hugging Face cache
services:
  # NOTE(review): Compose interpolates values, not mapping keys, so this key
  # presumably relies on the file being pre-rendered (e.g. envsubst) before
  # it is handed to Compose -- confirm against the caller.
  ${SERVICE_NAME:-tgi}:
    image: ghcr.io/huggingface/text-generation-inference:2.3.1

    # The container shares the host network stack, so the server is reachable
    # on the host at the port given to --port below.
    network_mode: "host"

    # Published ports are ignored (or rejected, depending on the Compose
    # version) when network_mode is "host", so no mapping is declared.
    # If you switch to bridge networking, restore the mapping (quoted):
    # ports:
    #   - "${TGI_PORT:-8000}:${TGI_PORT:-8000}"

    volumes:
      # Host Hugging Face cache, mounted at /data inside the container.
      - $HOME/.cache/huggingface:/data

    # NOTE(review): GPU access is requested three ways in this service (CDI
    # device here, deploy.resources reservation, and runtime: nvidia below).
    # They look redundant; kept as-is -- confirm which one the target runtime
    # actually needs before removing any.
    devices:
      - nvidia.com/gpu=all

    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      # Every Hugging Face cache location points at the /data mount above.
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data

    # Folded scalar: the lines below are joined with spaces into one argument
    # string; ">-" strips the trailing newline.
    command: >-
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_PORT:-8000}
      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}

    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]

    runtime: nvidia

    healthcheck:
      # The probe runs inside the container, so it targets localhost. The
      # service name is not resolvable under host networking, which would
      # leave the container permanently unhealthy.
      test: ["CMD", "curl", "-f", "http://localhost:${TGI_PORT:-8000}/health"]
      interval: 5s
      timeout: 5s
      retries: 30