fix: Don't cache clients for passthrough auth providers (#2728)

# What does this PR do?

Some of our inference providers support passthrough authentication via `x-llamastack-provider-data` header values. This fixes the providers that support passthrough auth to not cache their clients to the backend providers (mostly OpenAI client instances) so that the client connecting to Llama Stack has to provide those auth values on each and every request.

## Test Plan

I added some unit tests to ensure we're not caching clients across requests for all the fixed providers in this PR.

```
uv run pytest -sv tests/unit/providers/inference/test_inference_client_caching.py
```

I also ran some of our OpenAI compatible API integration tests for each of the changed providers, just to ensure they still work. Note that these providers don't actually pass all these tests (for unrelated reasons due to quirks of the Groq and Together SaaS services), but enough of the tests passed to confirm the clients are still working as intended.

### Together

```
ENABLE_TOGETHER="together" \
uv run llama stack run llama_stack/templates/starter/run.yaml

LLAMA_STACK_CONFIG=http://localhost:8321 \
uv run pytest -sv \
  tests/integration/inference/test_openai_completion.py \
  --text-model "together/meta-llama/Llama-3.1-8B-Instruct"
```

### OpenAI

```
ENABLE_OPENAI="openai" \
uv run llama stack run llama_stack/templates/starter/run.yaml

LLAMA_STACK_CONFIG=http://localhost:8321 \
uv run pytest -sv \
  tests/integration/inference/test_openai_completion.py \
  --text-model "openai/gpt-4o-mini"
```

### Groq

```
ENABLE_GROQ="groq" \
uv run llama stack run llama_stack/templates/starter/run.yaml

LLAMA_STACK_CONFIG=http://localhost:8321 \
uv run pytest -sv \
  tests/integration/inference/test_openai_completion.py \
  --text-model "groq/meta-llama/Llama-3.1-8B-Instruct"
```

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-10-04 04:04:14 +00:00 · 2025-07-11 16:38:27 -04:00 · 2025-07-11 16:38:27 -04:00 · 51d9fd4808
commit 51d9fd4808
parent aa2595c7c3
6 changed files with 196 additions and 45 deletions
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -68,19 +68,12 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
    def __init__(self, config: TogetherImplConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
        self.config = config
-        self._client = None
-        self._openai_client = None

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
-        if self._client:
-            # Together client has no close method, so just set to None
-            self._client = None
-        if self._openai_client:
-            await self._openai_client.close()
-            self._openai_client = None
+        pass

    async def completion(
        self,
@@ -108,29 +101,25 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
            return await self._nonstream_completion(request)

    def _get_client(self) -> AsyncTogether:
-        if not self._client:
-            together_api_key = None
-            config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None
-            if config_api_key:
-                together_api_key = config_api_key
-            else:
-                provider_data = self.get_request_provider_data()
-                if provider_data is None or not provider_data.together_api_key:
-                    raise ValueError(
-                        'Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": <your api key>}'
-                    )
-                together_api_key = provider_data.together_api_key
-            self._client = AsyncTogether(api_key=together_api_key)
-        return self._client
+        together_api_key = None
+        config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None
+        if config_api_key:
+            together_api_key = config_api_key
+        else:
+            provider_data = self.get_request_provider_data()
+            if provider_data is None or not provider_data.together_api_key:
+                raise ValueError(
+                    'Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": <your api key>}'
+                )
+            together_api_key = provider_data.together_api_key
+        return AsyncTogether(api_key=together_api_key)

    def _get_openai_client(self) -> AsyncOpenAI:
-        if not self._openai_client:
-            together_client = self._get_client().client
-            self._openai_client = AsyncOpenAI(
-                base_url=together_client.base_url,
-                api_key=together_client.api_key,
-            )
-        return self._openai_client
+        together_client = self._get_client().client
+        return AsyncOpenAI(
+            base_url=together_client.base_url,
+            api_key=together_client.api_key,
+        )

    async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse:
        params = await self._get_params(request)