From 2990303fd69e28fa4e24927f9adaafdc82d0334f Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 28 Oct 2024 16:10:36 -0700
Subject: [PATCH] dell tgi

---
 distributions/dell-tgi/compose.yaml | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/distributions/dell-tgi/compose.yaml b/distributions/dell-tgi/compose.yaml
index 791e7ec20..0e325aff5 100644
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@@ -9,16 +9,12 @@ services:
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
+      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
       - NUM_SHARD=4
       - MAX_BATCH_PREFILL_TOKENS=32768
       - MAX_INPUT_TOKENS=8000
       - MAX_TOTAL_TOKENS=8192
-    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+    command: []
     deploy:
       resources:
         reservations:
@@ -26,14 +22,13 @@
             - driver: nvidia
               # that's the closest analogue to --gpus; provide
               # an integer amount of devices or 'all'
-              count: 1
+              count: all
               # Devices are reserved using a list of capabilities, making
               # capabilities the only required field. A device MUST
               # satisfy all the requested capabilities for a successful
               # reservation.
               capabilities: [gpu]
     runtime: nvidia
-    shm_size: '1gb'
   llamastack:
     depends_on:
       text-generation-inference:
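
For reference, the affected section of the text-generation-inference service in
distributions/dell-tgi/compose.yaml after this patch reads roughly as follows.
This is a sketch reconstructed from the two hunks above, not the full file:
indentation is approximate, and keys the patch does not touch (image,
network_mode, volumes, ports, and the in-file comments) are omitted. Net
effect: the container is pinned to GPUs 0-4 via CUDA_VISIBLE_DEVICES, the TGI
launcher flags previously passed through `command` are dropped in favor of the
image's defaults, and the compose-level GPU reservation is widened from one
device to all of them.

    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
      - NUM_SHARD=4
      - MAX_BATCH_PREFILL_TOKENS=32768
      - MAX_INPUT_TOKENS=8000
      - MAX_TOTAL_TOKENS=8192
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    runtime: nvidia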