feat: switch to async completion in LiteLLM OpenAI mixin

- Replace synchronous litellm.completion() with async litellm.acompletion()
  - Update comment to reference official LiteLLM async completion docs
  - Improves async compatibility in the inference provider
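
For context, a minimal sketch of the async pattern this commit switches to, following the LiteLLM async completion docs referenced in the diff (https://docs.litellm.ai/docs/completion/stream#async-completion); the model name and message below are placeholders, not values from this repository:

    import asyncio

    import litellm


    async def main() -> None:
        # Await the async variant instead of calling the blocking
        # litellm.completion() on the event loop.
        response = await litellm.acompletion(
            model="openai/gpt-4o-mini",  # placeholder model name
            messages=[{"role": "user", "content": "Hello"}],
        )
        print(response.choices[0].message.content)


    asyncio.run(main())
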
Author: Eran Cohen
Date:   2025-08-03 17:09:42 +03:00
Parent: dbfc15123e
Commit: b1829e3497


@@ -158,9 +158,8 @@ class LiteLLMOpenAIMixin(
         params["model"] = self.get_litellm_model_name(params["model"])
         logger.debug(f"params to litellm (openai compat): {params}")
-        # unfortunately, we need to use synchronous litellm.completion here because litellm
-        # caches various httpx.client objects in a non-eventloop aware manner
-        response = litellm.completion(**params)
+        # see https://docs.litellm.ai/docs/completion/stream#async-completion
+        response = await litellm.acompletion(**params)
         if stream:
             return self._stream_chat_completion(response)
         else:
@@ -170,7 +169,7 @@ class LiteLLMOpenAIMixin(
         self, response: litellm.ModelResponse
     ) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
         async def _stream_generator():
-            for chunk in response:
+            async for chunk in response:
                 yield chunk

         async for chunk in convert_openai_chat_completion_stream(
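
The second hunk follows from the first: with stream=True, the awaited acompletion() call returns an async iterator of chunks, so the generator must consume it with async for rather than a plain for loop. A rough, self-contained sketch of that consumption pattern (model name and prompt are placeholders, not code from this repository):

    import asyncio

    import litellm


    async def stream_demo() -> None:
        # With stream=True the awaited call yields chunks asynchronously,
        # so a plain `for` loop over the response would not work.
        response = await litellm.acompletion(
            model="openai/gpt-4o-mini",  # placeholder model name
            messages=[{"role": "user", "content": "Stream something"}],
            stream=True,
        )
        async for chunk in response:
            # Each chunk mirrors the OpenAI streaming shape; delta.content
            # may be None for the final chunk.
            print(chunk.choices[0].delta.content or "", end="", flush=True)


    asyncio.run(stream_demo())
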