commit 2990303fd6 (parent 859f01c604)
Author: Xi Yan
Date:   2024-10-28 16:10:36 -07:00

@@ -9,16 +9,12 @@ services:
     devices:
       - nvidia.com/gpu=all
     environment:
+      - CUDA_VISIBLE_DEVICES=0
       - HF_HOME=/data
       - HF_DATASETS_CACHE=/data
       - HF_MODULES_CACHE=/data
       - HF_HUB_CACHE=/data
-      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
-      - NUM_SHARD=4
-      - MAX_BATCH_PREFILL_TOKENS=32768
-      - MAX_INPUT_TOKENS=8000
-      - MAX_TOTAL_TOKENS=8192
-    command: []
+    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
     deploy:
       resources:
         reservations:
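
This hunk replaces the multi-GPU sharded TGI configuration (NUM_SHARD=4 with an empty command) with a single-GPU setup that passes all server flags explicitly. For context, below is a minimal sketch of the resulting service; the environment and command values come from the diff, while the image, volumes, and ports keys are assumptions that fall outside the hunk:

    services:
      text-generation-inference:
        image: ghcr.io/huggingface/text-generation-inference:latest  # assumed tag, not part of this hunk
        volumes:
          - $HOME/.cache/huggingface:/data  # assumed host mount backing the HF_* cache paths
        ports:
          - "5009:5009"  # assumed; mirrors the --port flag below
        environment:
          - CUDA_VISIBLE_DEVICES=0  # expose only the first GPU to the server
          - HF_HOME=/data  # point every Hugging Face cache at the mounted volume
          - HF_DATASETS_CACHE=/data
          - HF_MODULES_CACHE=/data
          - HF_HUB_CACHE=/data
        command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]

With one visible device, --sharded false turns off tensor parallelism (the old NUM_SHARD=4 path), and --cuda-memory-fraction 0.3 caps the server at roughly 30% of the GPU's memory, leaving headroom for other workloads on the same card.
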
@@ -26,14 +22,13 @@ services:
             - driver: nvidia
               # that's the closest analogue to --gpus; provide
               # an integer amount of devices or 'all'
-              count: all
+              count: 1
               # Devices are reserved using a list of capabilities, making
               # capabilities the only required field. A device MUST
               # satisfy all the requested capabilities for a successful
               # reservation.
               capabilities: [gpu]
     runtime: nvidia
-    shm_size: '1gb'
   llamastack:
     depends_on:
       text-generation-inference:
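
The count: 1 reservation matches the single visible device configured above. As a standalone reference, the Compose device-reservation pattern this hunk settles on looks like the sketch below; the service name and image are placeholders, not taken from the diff:

    services:
      gpu-service:  # hypothetical service name
        image: nvidia/cuda:12.4.1-base-ubuntu22.04  # placeholder image
        deploy:
          resources:
            reservations:
              devices:
                - driver: nvidia
                  count: 1  # reserve exactly one device; 'all' requests every GPU
                  capabilities: [gpu]  # required; the device must satisfy every listed capability

deploy.resources.reservations is the Compose-spec analogue of docker run --gpus, which is why the legacy runtime: nvidia selector can sit alongside it; dropping shm_size: '1gb' also appears consistent with the move away from sharding, since a single-process server no longer needs the larger shared-memory segment that sharded (NCCL) serving does.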