diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index befb4b092..da2e634f6 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -158,9 +158,8 @@ class LiteLLMOpenAIMixin(
         params["model"] = self.get_litellm_model_name(params["model"])
         logger.debug(f"params to litellm (openai compat): {params}")
 
-        # unfortunately, we need to use synchronous litellm.completion here because litellm
-        # caches various httpx.client objects in a non-eventloop aware manner
-        response = litellm.completion(**params)
+        # see https://docs.litellm.ai/docs/completion/stream#async-completion
+        response = await litellm.acompletion(**params)
         if stream:
             return self._stream_chat_completion(response)
         else:
@@ -170,7 +169,7 @@ class LiteLLMOpenAIMixin(
         self, response: litellm.ModelResponse
     ) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
         async def _stream_generator():
-            for chunk in response:
+            async for chunk in response:
                 yield chunk
 
         async for chunk in convert_openai_chat_completion_stream(
diff --git a/tests/integration/test_cases/inference/chat_completion.json b/tests/integration/test_cases/inference/chat_completion.json
index 1ae018397..203fc51a5 100644
--- a/tests/integration/test_cases/inference/chat_completion.json
+++ b/tests/integration/test_cases/inference/chat_completion.json
@@ -78,7 +78,7 @@
       },
       {
         "role": "user",
-        "content": "What's the weather like in San Francisco?"
+        "content": "What's the weather like in San Francisco, CA?"
       }
     ],
     "tools": [
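
For reference, a minimal standalone sketch of the async streaming pattern this diff adopts (per the linked litellm docs): `litellm.acompletion(..., stream=True)` must be awaited and its chunks consumed with `async for`, as in the updated `_stream_generator`. The model name and message below are illustrative, not taken from this repository, and assume litellm is configured with valid provider credentials.

```python
import asyncio

import litellm


async def main() -> None:
    # Awaiting acompletion with stream=True returns an async chunk iterator,
    # so it can be consumed inside the event loop without blocking it.
    response = await litellm.acompletion(
        model="openai/gpt-4o-mini",  # illustrative model name
        messages=[{"role": "user", "content": "What's the weather like in San Francisco, CA?"}],
        stream=True,
    )
    async for chunk in response:
        # Chunks are OpenAI-compatible; delta.content may be None on
        # role-only or finish chunks, so fall back to an empty string.
        print(chunk.choices[0].delta.content or "", end="", flush=True)
    print()


asyncio.run(main())
```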