From b1829e3497c22ae86c86d2cd196b8f876fa26ab6 Mon Sep 17 00:00:00 2001
From: Eran Cohen
Date: Sun, 3 Aug 2025 17:09:42 +0300
Subject: [PATCH] feat: switch to async completion in LiteLLM OpenAI mixin

- Replace synchronous litellm.completion() with async litellm.acompletion()
- Update comment to reference official LiteLLM async completion docs
- Improves async compatibility in the inference provider
---
 .../providers/utils/inference/litellm_openai_mixin.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index befb4b092..da2e634f6 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -158,9 +158,8 @@ class LiteLLMOpenAIMixin(
         params["model"] = self.get_litellm_model_name(params["model"])

         logger.debug(f"params to litellm (openai compat): {params}")
-        # unfortunately, we need to use synchronous litellm.completion here because litellm
-        # caches various httpx.client objects in a non-eventloop aware manner
-        response = litellm.completion(**params)
+        # see https://docs.litellm.ai/docs/completion/stream#async-completion
+        response = await litellm.acompletion(**params)
         if stream:
             return self._stream_chat_completion(response)
         else:
@@ -170,7 +169,7 @@ class LiteLLMOpenAIMixin(
         self, response: litellm.ModelResponse
     ) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
         async def _stream_generator():
-            for chunk in response:
+            async for chunk in response:
                 yield chunk

         async for chunk in convert_openai_chat_completion_stream(
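
For context, a minimal standalone sketch of the async litellm API this patch switches to, based on the docs linked in the new comment (https://docs.litellm.ai/docs/completion/stream#async-completion). The model name and messages below are placeholders for illustration, not values from the patch:

    # Sketch only: shows the non-streaming and streaming acompletion patterns
    # that the patched code relies on. Model/messages are hypothetical.
    import asyncio

    import litellm


    async def main() -> None:
        messages = [{"role": "user", "content": "Say hello"}]

        # Non-streaming: awaiting acompletion returns a ModelResponse,
        # analogous to what synchronous litellm.completion() returned.
        response = await litellm.acompletion(model="gpt-4o-mini", messages=messages)
        print(response.choices[0].message.content)

        # Streaming: with stream=True the awaited result is an async iterator,
        # which is why the patched _stream_generator consumes it with `async for`.
        stream = await litellm.acompletion(model="gpt-4o-mini", messages=messages, stream=True)
        async for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                print(delta, end="", flush=True)


    asyncio.run(main())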