Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-15 06:00:48 +00:00
feat: switch to async completion in LiteLLM OpenAI mixin
- Replace synchronous litellm.completion() with async litellm.acompletion()
- Update comment to reference the official LiteLLM async completion docs
- Improves async compatibility in the inference provider
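For illustration only (not code from this repository), here is a minimal sketch of the sync-to-async switch described above, assuming litellm is installed; the model name and messages are hypothetical placeholders:

```python
import asyncio

import litellm


async def main() -> None:
    params = {
        "model": "gpt-4o-mini",  # hypothetical model name, for illustration only
        "messages": [{"role": "user", "content": "Hello"}],
    }

    # Before this commit, the provider used the blocking entry point:
    #     response = litellm.completion(**params)
    # which ties up the event loop when called from async code.

    # After this commit, the async entry point is awaited instead:
    response = await litellm.acompletion(**params)
    print(response.choices[0].message.content)


asyncio.run(main())
```

The same params dict works for both entry points; only the function name and the await change.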
This commit is contained in:
parent: dbfc15123e
commit: b1829e3497
1 changed file with 3 additions and 4 deletions
@@ -158,9 +158,8 @@ class LiteLLMOpenAIMixin(
         params["model"] = self.get_litellm_model_name(params["model"])
 
         logger.debug(f"params to litellm (openai compat): {params}")
-        # unfortunately, we need to use synchronous litellm.completion here because litellm
-        # caches various httpx.client objects in a non-eventloop aware manner
-        response = litellm.completion(**params)
+        # see https://docs.litellm.ai/docs/completion/stream#async-completion
+        response = await litellm.acompletion(**params)
         if stream:
             return self._stream_chat_completion(response)
         else:
@@ -170,7 +169,7 @@ class LiteLLMOpenAIMixin(
         self, response: litellm.ModelResponse
     ) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
         async def _stream_generator():
-            for chunk in response:
+            async for chunk in response:
                 yield chunk
 
         async for chunk in convert_openai_chat_completion_stream(
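To show why the inner generator in the second hunk switches to `async for`, here is a hedged sketch of async streaming with litellm (again with a hypothetical model name); per the linked docs, `acompletion(..., stream=True)` yields chunks through an async iterator:

```python
import asyncio

import litellm


async def stream_demo() -> None:
    # With stream=True, acompletion returns an async iterator of delta chunks,
    # so it must be consumed with `async for` rather than a plain `for`.
    response = await litellm.acompletion(
        model="gpt-4o-mini",  # hypothetical model name, for illustration only
        messages=[{"role": "user", "content": "Say hi"}],
        stream=True,
    )
    async for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)
    print()


asyncio.run(stream_demo())
```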