fix(inference): AttributeError in streaming response cleanup (#4236)

This PR fixes issue #3185.

When clients cancel streaming requests, the server tries to clean up the
underlying stream with:

```python
await event_gen.aclose()  #  AsyncStream doesn't have aclose()!
```

But OpenAI's `AsyncStream` has never had a public `aclose()` method; it
exposes `close()` instead, and that `close()` is itself async. The error
message says as much:

```
AttributeError: 'AsyncStream' object has no attribute 'aclose'. Did you mean: 'close'?
                                                                            ^^^^^^^^
```
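
As the diff below suggests, the fix routes streaming responses through a plain async generator, which always exposes `aclose()`. Here is a minimal sketch, assuming `wrap_async_stream` simply re-yields chunks (the real helper lives in `llama_stack.providers.utils.inference.stream_utils` and may do more, e.g. closing the underlying stream on exit):

```python
from collections.abc import AsyncIterator
from typing import TypeVar

T = TypeVar("T")

async def wrap_async_stream(stream: AsyncIterator[T]) -> AsyncIterator[T]:
    # Re-yielding through an async generator hands callers a genuine
    # aclose() method, regardless of the underlying stream type.
    async for chunk in stream:
        yield chunk
```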

## Verification
* Reproduction script
  [`reproduce_issue_3185.sh`](https://gist.github.com/r-bit-rry/dea4f8fbb81c446f5db50ea7abd6379b)
  can be used to verify the fix (a sketch of the cancellation approach
  appears below).
* Manual checks and validation against the original OpenAI library code.
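
The gist above holds the actual script; roughly, a cancellation-based reproduction looks like this (base URL, API key, and model name are placeholders, not taken from the PR):

```python
# Hypothetical reproduction: open a streaming completion and cancel it
# mid-stream, which exercises the server's stream-cleanup path. Before
# the fix, the server log showed the AttributeError quoted above.
import asyncio

from openai import AsyncOpenAI


async def drain(stream) -> None:
    async for _chunk in stream:
        pass


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8321/v1", api_key="dummy")
    stream = await client.completions.create(
        model="my-model", prompt="Hello", stream=True, max_tokens=256
    )
    consumer = asyncio.create_task(drain(stream))
    await asyncio.sleep(0.2)  # let a few chunks arrive
    consumer.cancel()  # client-side cancel triggers server-side cleanup
    try:
        await consumer
    except asyncio.CancelledError:
        pass


asyncio.run(main())
```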

The full commit touches 14 files (213 additions and 30 deletions); the
excerpt below, from the watsonx adapter, is representative:

```diff
@@ -14,6 +14,7 @@ from llama_stack.log import get_logger
 from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
+from llama_stack.providers.utils.inference.stream_utils import wrap_async_stream
 from llama_stack_api import (
     Model,
     ModelType,
@@ -177,7 +178,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
     async def openai_completion(
         self,
         params: OpenAICompletionRequestWithExtraBody,
-    ) -> OpenAICompletion:
+    ) -> OpenAICompletion | AsyncIterator[OpenAICompletion]:
         """
         Override parent method to add watsonx-specific parameters.
         """
@@ -210,7 +211,12 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
             timeout=self.config.timeout,
             project_id=self.config.project_id,
         )
-        return await litellm.atext_completion(**request_params)
+        result = await litellm.atext_completion(**request_params)
+        if params.stream:
+            return wrap_async_stream(result)  # type: ignore[arg-type] # LiteLLM streaming types
+        return result  # type: ignore[return-value] # external lib lacks type stubs
 
     async def openai_embeddings(
         self,
```
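
With streams wrapped this way across providers, the server's cleanup path can call `aclose()` unconditionally. A hedged sketch of that pattern (names are illustrative, not the actual llama-stack code):

```python
import asyncio
from collections.abc import AsyncGenerator


async def sse_generator(event_gen: AsyncGenerator) -> AsyncGenerator:
    # Forward events to the client until it disconnects, then close the
    # upstream generator and re-raise the cancellation.
    try:
        async for event in event_gen:
            yield event
    except asyncio.CancelledError:
        # event_gen is an async generator (thanks to wrap_async_stream),
        # so aclose() is guaranteed to exist here.
        await event_gen.aclose()
        raise
```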