Convert TGI

2025-12-22 03:22:25 +00:00 · 2024-11-17 14:49:41 -08:00 · 2024-11-17 14:49:41 -08:00 · 028530546f
commit 028530546f
parent 9bb07ce298
14 changed files with 485 additions and 160 deletions
--- a/distributions/tgi/compose.yaml
+++ b/distributions/tgi/compose.yaml
@@ -1,51 +1,89 @@
 services:
-  text-generation-inference:
+  tgi-inference:  # TGI server for the inference model; llamastack waits for it to report healthy
    image: ghcr.io/huggingface/text-generation-inference:latest
-    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
+    network_mode: ${NETWORK_MODE:-bridged}  # NOTE(review): Docker's built-in mode is spelled "bridge" - confirm "bridged" is accepted
    ports:
-      - "5009:5009"
+       - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"  # same port on host and container; must match --port in command
    devices:
      - nvidia.com/gpu=all
    environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}  # GPU index exposed to this server; defaults to 0
+      - HF_TOKEN=$HF_TOKEN  # forwarded from the environment Compose runs in
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
-    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --port ${TGI_INFERENCE_PORT:-8080}
+      --cuda-memory-fraction 0.75
+    healthcheck:  # polled every 5s with a 5s timeout, up to 30 retries
+      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]  # NOTE(review): probes the service name, not localhost - confirm it resolves inside this container for every NETWORK_MODE
+      interval: 5s
+      timeout: 5s
+      retries: 30
    deploy:
      resources:
        reservations:
          devices:
          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
            capabilities: [gpu]
    runtime: nvidia
+
+  tgi-${TGI_SAFETY_MODEL:+safety}:  # NOTE(review): confirm Compose interpolates mapping keys; the healthcheck below hard-codes the name tgi-safety
+    image: ghcr.io/huggingface/text-generation-inference:latest
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    network_mode: ${NETWORK_MODE:-bridged}  # NOTE(review): Docker's built-in mode is spelled "bridge" - confirm "bridged" is accepted
+    ports:
+       - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"  # same port on host and container; must match --port in command
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}  # GPU index exposed to this server; defaults to 1
+      - HF_TOKEN=$HF_TOKEN  # forwarded from the environment Compose runs in
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+      --port ${TGI_SAFETY_PORT:-8081}
+      --cuda-memory-fraction 0.75
    healthcheck:
-      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
+      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]  # NOTE(review): probes the service name, not localhost - confirm it resolves inside this container
      interval: 5s
      timeout: 5s
      retries: 30
+    deploy:  # reserves an NVIDIA device with the gpu capability
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+
  llamastack:
    depends_on:
-      text-generation-inference:
+      tgi-inference:  # start only after the inference TGI server reports healthy
        condition: service_healthy
-    image: llamastack/distribution-tgi
-    network_mode: "host"
+      tgi-${TGI_SAFETY_MODEL:+safety}:  # NOTE(review): must equal the safety service's key exactly - confirm how Compose treats the variable here
+        condition: service_healthy
+    image: llamastack/distribution-tgi:test-0.0.52rc3  # pinned to a test release-candidate tag
+    network_mode: ${NETWORK_MODE:-bridged}  # NOTE(review): Docker's built-in mode is spelled "bridge" - confirm "bridged" is accepted
    volumes:
      - ~/.llama:/root/.llama
-      # Link to TGI run.yaml file
-      - ./run.yaml:/root/my-run.yaml
+      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml  # NOTE(review): presumably run-with-safety.yaml when TGI_SAFETY_MODEL is set, run.yaml otherwise - confirm both files exist
    ports:
-      - "5000:5000"
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"  # NOTE(review): the entrypoint passes no port flag - confirm the server listens on this port
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
@@ -53,3 +91,13 @@ services:
      delay: 3s
      max_attempts: 5
      window: 60s
+    environment:  # TGI endpoints and model names passed into the llamastack container
+      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}  # reuses the port variable the inference TGI service publishes
+      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}  # NOTE(review): assumes a service reachable as tgi-safety - verify against the service keys
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}  # NOTE(review): the TGI service reads TGI_INFERENCE_MODEL instead - keep the two in sync
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}  # NOTE(review): the TGI service reads TGI_SAFETY_MODEL instead - keep the two in sync
+
+volumes:  # NOTE(review): named volumes; the service lines visible in this diff use bind mounts only - confirm these are referenced
+  tgi-inference:
+  tgi-safety:
+  llamastack: