From 2990303fd69e28fa4e24927f9adaafdc82d0334f Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 28 Oct 2024 16:10:36 -0700
Subject: [PATCH] dell tgi

---
 distributions/dell-tgi/compose.yaml | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/distributions/dell-tgi/compose.yaml b/distributions/dell-tgi/compose.yaml
index 791e7ec20..0e325aff5 100644
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@@ -9,16 +9,12 @@ services:
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
+      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
       - NUM_SHARD=4
       - MAX_BATCH_PREFILL_TOKENS=32768
       - MAX_INPUT_TOKENS=8000
       - MAX_TOTAL_TOKENS=8192
-    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+    command: []
     deploy:
       resources:
         reservations:
@@ -26,14 +22,13 @@
             - driver: nvidia
               # that's the closest analogue to --gpus; provide
               # an integer amount of devices or 'all'
-              count: 1
+              count: all
               # Devices are reserved using a list of capabilities, making
               # capabilities the only required field. A device MUST
               # satisfy all the requested capabilities for a successful
               # reservation.
               capabilities: [gpu]
     runtime: nvidia
-    shm_size: '1gb'
   llamastack:
     depends_on:
       text-generation-inference:
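
For reference, the affected section of the text-generation-inference service in
distributions/dell-tgi/compose.yaml after this patch reads roughly as follows.
This is a sketch reconstructed from the two hunks above, not the full file:
indentation is approximate, and keys the patch does not touch (image,
network_mode, volumes, ports, and the in-file comments) are omitted. Net
effect: the container is pinned to GPUs 0-4 via CUDA_VISIBLE_DEVICES, the TGI
launcher flags previously passed through `command` are dropped in favor of the
image's defaults, and the compose-level GPU reservation is widened from one
device to all of them.

    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
      - NUM_SHARD=4
      - MAX_BATCH_PREFILL_TOKENS=32768
      - MAX_INPUT_TOKENS=8000
      - MAX_TOTAL_TOKENS=8192
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    runtime: nvidia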