From b1829e3497c22ae86c86d2cd196b8f876fa26ab6 Mon Sep 17 00:00:00 2001
From: Eran Cohen
Date: Sun, 3 Aug 2025 17:09:42 +0300
Subject: [PATCH] feat: switch to async completion in LiteLLM OpenAI mixin

- Replace synchronous litellm.completion() with async litellm.acompletion()
- Update comment to reference official LiteLLM async completion docs
- Improves async compatibility in the inference provider
---
 .../providers/utils/inference/litellm_openai_mixin.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index befb4b092..da2e634f6 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -158,9 +158,8 @@ class LiteLLMOpenAIMixin(
         params["model"] = self.get_litellm_model_name(params["model"])

         logger.debug(f"params to litellm (openai compat): {params}")
-        # unfortunately, we need to use synchronous litellm.completion here because litellm
-        # caches various httpx.client objects in a non-eventloop aware manner
-        response = litellm.completion(**params)
+        # see https://docs.litellm.ai/docs/completion/stream#async-completion
+        response = await litellm.acompletion(**params)
         if stream:
             return self._stream_chat_completion(response)
         else:
@@ -170,7 +169,7 @@ class LiteLLMOpenAIMixin(
         self, response: litellm.ModelResponse
     ) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
         async def _stream_generator():
-            for chunk in response:
+            async for chunk in response:
                 yield chunk

         async for chunk in convert_openai_chat_completion_stream(
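
For context, a minimal standalone sketch of the async litellm API this patch switches to, based on the docs linked in the new comment (https://docs.litellm.ai/docs/completion/stream#async-completion). The model name and messages below are placeholders for illustration, not values from the patch:

    # Sketch only: shows the non-streaming and streaming acompletion patterns
    # that the patched code relies on. Model/messages are hypothetical.
    import asyncio

    import litellm


    async def main() -> None:
        messages = [{"role": "user", "content": "Say hello"}]

        # Non-streaming: awaiting acompletion returns a ModelResponse,
        # analogous to what synchronous litellm.completion() returned.
        response = await litellm.acompletion(model="gpt-4o-mini", messages=messages)
        print(response.choices[0].message.content)

        # Streaming: with stream=True the awaited result is an async iterator,
        # which is why the patched _stream_generator consumes it with `async for`.
        stream = await litellm.acompletion(model="gpt-4o-mini", messages=messages, stream=True)
        async for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                print(delta, end="", flush=True)


    asyncio.run(main())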