diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 363bc4299..010f0c5d6 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -128,8 +128,9 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): fmt: ResponseFormat = None, ): options = get_sampling_options(sampling_params) + # TGI does not support temperature=0, so we set it to 1e-3 instead if options["temperature"] == 0: - options["temperature"] = 0.1 + options["temperature"] = 1e-3 # delete key "max_tokens" from options since its not supported by the API options.pop("max_tokens", None) @@ -233,7 +234,6 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): self, request: ChatCompletionRequest ) -> ChatCompletionResponse: params = await self._get_params(request) - print("TGI params", params) r = await self.client.text_generation(**params) choice = OpenAICompatCompletionChoice(