Mirror of https://github.com/meta-llama/llama-stack.git
convert blocking calls to async
Signed-off-by: Jaideep Rao <jrao@redhat.com>
Commit 66412ab12b (parent 5403582582)
3 changed files with 23 additions and 18 deletions
@@ -112,9 +112,9 @@ class LiteLLMOpenAIMixin(
         params = await self._get_params(request)
         logger.debug(f"params to litellm (openai compat): {params}")
-        # unfortunately, we need to use synchronous litellm.completion here because litellm
-        # caches various httpx.client objects in a non-eventloop aware manner
-        response = litellm.completion(**params)
+        # Litellm seems to have implemented an async completion method
+        # https://docs.litellm.ai/docs/completion/stream#async-completion
+        response = await litellm.acompletion(**params)
         if stream:
             return self._stream_chat_completion(response)
         else:
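The first hunk replaces the blocking litellm.completion call (originally kept synchronous because litellm cached httpx client objects in a non-eventloop-aware way) with the awaitable litellm.acompletion documented at https://docs.litellm.ai/docs/completion/stream#async-completion. A minimal, self-contained sketch of that pattern outside the mixin; the params dict and model string below are illustrative and not taken from the commit:

```python
# Minimal sketch: awaiting litellm's async completion API instead of the
# blocking litellm.completion(), so the event loop stays free while the
# HTTP request is in flight.
import asyncio

import litellm


async def chat_once(params: dict) -> str:
    # litellm.acompletion is the documented async counterpart of litellm.completion.
    response = await litellm.acompletion(**params)
    return response.choices[0].message.content


if __name__ == "__main__":
    # Hypothetical params; any litellm-supported model string works here.
    params = {
        "model": "openai/gpt-4o-mini",
        "messages": [{"role": "user", "content": "Hello"}],
    }
    print(asyncio.run(chat_once(params)))
```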
@@ -124,7 +124,7 @@ class LiteLLMOpenAIMixin(
         self, response: litellm.ModelResponse
     ) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
         async def _stream_generator():
-            for chunk in response:
+            async for chunk in response:
                 yield chunk

         async for chunk in convert_openai_chat_completion_stream(
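The second hunk adjusts the inner stream generator: the stream returned by litellm.acompletion(stream=True) is consumed asynchronously, so the generator has to iterate with async for rather than a plain for loop. A self-contained sketch of the same streaming pattern (the model string and print-based consumer are illustrative, not part of the commit):

```python
# Minimal sketch: wrapping an async litellm stream in an async generator,
# mirroring the `async for chunk in response` change in the hunk above.
import asyncio

import litellm


async def stream_chunks(params: dict) -> None:
    response = await litellm.acompletion(stream=True, **params)

    # The stream must be consumed with `async for`; a synchronous `for` loop
    # is what the commit is moving away from.
    async def _stream_generator():
        async for chunk in response:
            yield chunk

    async for chunk in _stream_generator():
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)


if __name__ == "__main__":
    params = {
        "model": "openai/gpt-4o-mini",  # illustrative model string
        "messages": [{"role": "user", "content": "Stream a short reply"}],
    }
    asyncio.run(stream_chunks(params))
```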
@@ -223,10 +223,10 @@ class LiteLLMOpenAIMixin(
     ) -> EmbeddingsResponse:
         model = await self.model_store.get_model(model_id)

-        response = litellm.embedding(
+        response = await litellm.embedding(
             model=model.provider_resource_id,
             input=[interleaved_content_as_str(content) for content in contents],
         )

-        embeddings = [data["embedding"] for data in response["data"]]
+        embeddings = await [data["embedding"] for data in response["data"]]
         return EmbeddingsResponse(embeddings=embeddings)
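The third hunk applies the same treatment to the embeddings path. Two details worth flagging: litellm's documented async counterpart of litellm.embedding is litellm.aembedding, and a plain list comprehension is not awaitable (awaiting it raises a TypeError), so only the network call needs an await. A minimal sketch under those assumptions, with an illustrative embedding model string in place of the mixin's provider_resource_id:

```python
# Minimal sketch: non-blocking embeddings via litellm.aembedding, keeping the
# result extraction synchronous since a plain list cannot be awaited.
import asyncio

import litellm


async def embed_texts(texts: list[str]) -> list[list[float]]:
    response = await litellm.aembedding(
        model="openai/text-embedding-3-small",  # illustrative model string
        input=texts,
    )
    # The response payload is OpenAI-shaped, as the diff's own
    # response["data"] access shows; pulling out the vectors is ordinary
    # synchronous work and needs no await.
    return [data["embedding"] for data in response["data"]]


if __name__ == "__main__":
    vectors = asyncio.run(embed_texts(["hello", "world"]))
    print(len(vectors), len(vectors[0]))
```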