mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-17 09:22:36 +00:00
Adding docker-compose.yaml, starting to simplify
This commit is contained in:
parent
e4509cb568
commit
f38e76ee98
14 changed files with 516 additions and 386 deletions
|
|
@ -0,0 +1,35 @@
|
|||
# Docker Compose definition for a single Hugging Face Text Generation
# Inference (TGI) server with NVIDIA GPU access.
#
# Variables substituted by Compose (default in parentheses):
#   TGI_PORT                  port passed to TGI via --port (8000)
#   TGI_MODEL                 model id passed via --model-id
#                             (meta-llama/Llama-3.2-3B-Instruct)
#   TGI_CUDA_MEMORY_FRACTION  value for --cuda-memory-fraction (0.8)
#   CUDA_VISIBLE_DEVICES      forwarded into the container environment (0)
services:
  # Compose interpolates values only, never mapping keys, so the service
  # name has to be a literal; `tgi` is the former `${SERVICE_NAME:-tgi}`
  # default.
  tgi:
    image: ghcr.io/huggingface/text-generation-inference:2.3.1
    # Host networking: the container shares the host's network stack, so
    # the --port given below is reachable on the host directly. A `ports:`
    # mapping must not be combined with this mode and is therefore omitted.
    network_mode: "host"
    volumes:
      # The host's Hugging Face cache is mounted at /data, which every
      # HF_* cache variable below points to.
      - $HOME/.cache/huggingface:/data
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    # Folded scalar: the lines below are joined with single spaces into one
    # argument string; `>-` drops the trailing newline.
    command: >-
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_PORT:-8000}
      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      # With host networking the container is not attached to a Compose
      # network, so the service name is not resolvable; probe localhost.
      test:
        - CMD
        - curl
        - -f
        - "http://localhost:${TGI_PORT:-8000}/health"
      interval: 5s
      timeout: 5s
      retries: 30
|
||||
Loading…
Add table
Add a link
Reference in a new issue