Adding docker-compose.yaml, starting to simplify

2025-12-17 09:22:36 +00:00 · 2024-11-16 10:56:38 -08:00 · 2024-11-16 10:56:38 -08:00 · f38e76ee98
commit f38e76ee98
parent e4509cb568
14 changed files with 516 additions and 386 deletions
--- a/llama_stack/providers/remote/inference/tgi/docker_compose.yaml
+++ b/llama_stack/providers/remote/inference/tgi/docker_compose.yaml
@@ -0,0 +1,35 @@
+# Text Generation Inference (TGI) server; listens on TGI_PORT (default 8000).
+services:
+  ${SERVICE_NAME:-tgi}:  # NOTE(review): Compose interpolates values, not keys; confirm this file is pre-rendered
+    image: ghcr.io/huggingface/text-generation-inference:2.3.1
+    # No `ports:` mapping: with host networking the --port below is bound on the host directly.
+    network_mode: "host"
+    volumes:  # host Hugging Face cache is mounted at /data
+      - $HOME/.cache/huggingface:/data
+    devices:
+      - nvidia.com/gpu=all
+    environment:  # every HF_* cache location points at the /data mount
+      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --port ${TGI_PORT:-8000}
+      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
+    runtime: nvidia
+    healthcheck:  # probe localhost: the service name is not resolvable under host networking
+      test: ["CMD", "curl", "-f", "http://localhost:${TGI_PORT:-8000}/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30