From b194fed28d7c213374da595a5e4db8da44a75d18 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 16 Jan 2025 17:38:22 -0800
Subject: [PATCH] more robust 0 check

---
 llama_stack/providers/remote/inference/tgi/tgi.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 010f0c5d6..7f8c9d8ab 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -128,8 +128,10 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         fmt: ResponseFormat = None,
     ):
         options = get_sampling_options(sampling_params)
-        # TGI does not support temperature=0, so we set it to 1e-3 instead
-        if options["temperature"] == 0:
+        # TGI does not support temperature=0 when using greedy sampling
+        # We set it to 1e-3 instead, anything lower outputs garbage from TGI
+        # We can use top_p sampling strategy to specify lower temperature
+        if abs(options["temperature"]) < 1e-10:
             options["temperature"] = 1e-3
         # delete key "max_tokens" from options since its not supported by the API
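
A minimal standalone sketch (not part of the patch) of the epsilon check it introduces: comparing `abs(temperature) < 1e-10` catches exact zero as well as near-zero float values (e.g. 1e-12 from rounding) that a strict `== 0` would miss. The helper name `clamp_temperature` and the sample values below are assumptions for illustration only.

```python
def clamp_temperature(options: dict) -> dict:
    # Treat exact zero and tiny float noise (e.g. 1e-12) as "zero",
    # since TGI rejects or misbehaves at temperature=0.
    if abs(options.get("temperature", 1.0)) < 1e-10:
        options["temperature"] = 1e-3  # smallest value the patch considers safe for TGI
    return options


print(clamp_temperature({"temperature": 0.0}))    # {'temperature': 0.001}
print(clamp_temperature({"temperature": 1e-12}))  # {'temperature': 0.001}
print(clamp_temperature({"temperature": 0.7}))    # {'temperature': 0.7}
```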