fix tgi

2025-08-05 18:22:41 +00:00 · 2025-01-16 16:46:49 -08:00 · 2025-01-16 16:46:49 -08:00 · 5c6e1e9d1e
commit 5c6e1e9d1e
parent e88faa91e2
3 changed files with 10 additions and 5 deletions
--- a/llama_stack/providers/remote/inference/tgi/config.py
+++ b/llama_stack/providers/remote/inference/tgi/config.py
@ -15,10 +15,10 @@ class TGIImplConfig(BaseModel):
    url: str = Field(
        description="The URL for the TGI serving endpoint",
    )
-    api_token: Optional[SecretStr] = Field(
-        default=None,
-        description="A bearer token if your TGI endpoint is protected.",
-    )
+    # api_token: Optional[SecretStr] = Field(
+    #     default=None,
+    #     description="A bearer token if your TGI endpoint is protected.",
+    # )

    @classmethod
    def sample_run_config(cls, url: str = "${env.TGI_URL}", **kwargs):
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -128,6 +128,9 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
        fmt: ResponseFormat = None,
    ):
        options = get_sampling_options(sampling_params)
+        if options["temperature"] == 0:
+            options["temperature"] = 0.1
+
        # delete key "max_tokens" from options since its not supported by the API
        options.pop("max_tokens", None)
        if fmt:
@ -230,6 +233,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
        self, request: ChatCompletionRequest
    ) -> ChatCompletionResponse:
        params = await self._get_params(request)
+        print("TGI params", params)
        r = await self.client.text_generation(**params)

        choice = OpenAICompatCompletionChoice(
@ -289,7 +293,7 @@ class TGIAdapter(_HfAdapter):
    async def initialize(self, config: TGIImplConfig) -> None:
        log.info(f"Initializing TGI client with url={config.url}")
        self.client = AsyncInferenceClient(
-            model=config.url, token=config.api_token.get_secret_value()
+            model=config.url,
        )
        endpoint_info = await self.client.get_endpoint_info()
        self.max_tokens = endpoint_info["max_total_tokens"]
--- a/tests/client-sdk/inference/test_inference.py
+++ b/tests/client-sdk/inference/test_inference.py
@ -225,6 +225,7 @@ def test_text_chat_completion_with_tool_calling_and_non_streaming(
        tool_prompt_format=provider_tool_format,
        stream=False,
    )
+    print(response)
    # No content is returned for the system message since we expect the
    # response to be a tool call
    assert response.completion_message.content == ""