diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 010f0c5d6..7f8c9d8ab 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -128,8 +128,10 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         fmt: ResponseFormat = None,
     ):
         options = get_sampling_options(sampling_params)
-        # TGI does not support temperature=0, so we set it to 1e-3 instead
-        if options["temperature"] == 0:
+        # TGI does not support temperature=0 when using greedy sampling
+        # We set it to 1e-3 instead; anything lower outputs garbage from TGI
+        # We can use the top_p sampling strategy to specify a lower temperature
+        if abs(options["temperature"]) < 1e-10:
             options["temperature"] = 1e-3
         # delete key "max_tokens" from options since its not supported by the API
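
For context, a minimal standalone sketch of the clamping behavior this hunk introduces. The function name `clamp_temperature_for_tgi` and the `TGI_MIN_TEMPERATURE` constant are illustrative only (the patch inlines the constant inside `_HfAdapter`); the point is that an exact `== 0` check is fragile once the temperature arrives as a float, while the tolerance check catches both exact and near-zero values.

```python
# Illustrative sketch, not the adapter code itself: shows how a
# (near-)zero temperature is floored to the smallest value TGI accepts.

TGI_MIN_TEMPERATURE = 1e-3  # hypothetical name; the patch uses the literal 1e-3


def clamp_temperature_for_tgi(options: dict) -> dict:
    """Clamp a (near-)zero temperature to the minimum TGI supports."""
    if abs(options.get("temperature", 1.0)) < 1e-10:
        options["temperature"] = TGI_MIN_TEMPERATURE
    return options


if __name__ == "__main__":
    # Greedy sampling requested upstream may surface as 0.0, or as a tiny
    # float after conversions; both are clamped, normal values pass through.
    print(clamp_temperature_for_tgi({"temperature": 0.0}))    # {'temperature': 0.001}
    print(clamp_temperature_for_tgi({"temperature": 1e-12}))  # {'temperature': 0.001}
    print(clamp_temperature_for_tgi({"temperature": 0.7}))    # {'temperature': 0.7}
```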