diff --git a/litellm/llms/custom_httpx/llm_http_handler.py b/litellm/llms/custom_httpx/llm_http_handler.py
index 6b7eda3f6c..5ce05f4da7 100644
--- a/litellm/llms/custom_httpx/llm_http_handler.py
+++ b/litellm/llms/custom_httpx/llm_http_handler.py
@@ -18,6 +18,7 @@ from litellm.llms.custom_httpx.http_handler import (
     _get_httpx_client,
     get_async_httpx_client,
 )
+from litellm.responses.streaming_iterator import ResponsesAPIStreamingIterator
 from litellm.types.llms.openai import (
     ResponseInputParam,
     ResponsesAPIOptionalRequestParams,
@@ -973,7 +974,7 @@ class BaseLLMHTTPHandler:
         extra_headers: Optional[Dict[str, Any]] = None,
         extra_body: Optional[Dict[str, Any]] = None,
         timeout: Optional[Union[float, httpx.Timeout]] = None,
-    ) -> ResponsesAPIResponse:
+    ) -> Union[ResponsesAPIResponse, ResponsesAPIStreamingIterator]:
         if client is None or not isinstance(client, AsyncHTTPHandler):
             async_httpx_client = get_async_httpx_client(
                 llm_provider=litellm.LlmProviders(custom_llm_provider)
@@ -1010,13 +1011,34 @@ class BaseLLMHTTPHandler:
             },
         )
 
+        # Check if streaming is requested
+        stream = response_api_optional_request_params.get("stream", False)
+
         try:
-            response = await async_httpx_client.post(
-                url=api_base,
-                headers=headers,
-                data=json.dumps(data),
-                timeout=response_api_optional_request_params.get("timeout"),
-            )
+            if stream:
+                # For streaming, we need to use stream=True in the request
+                response = await async_httpx_client.post(
+                    url=api_base,
+                    headers=headers,
+                    data=json.dumps(data),
+                    timeout=response_api_optional_request_params.get("timeout"),
+                    stream=True,
+                )
+
+                # Return the streaming iterator
+                return ResponsesAPIStreamingIterator(
+                    response=response,
+                    model=model,
+                    logging_obj=logging_obj,
+                )
+            else:
+                # For non-streaming, proceed as before
+                response = await async_httpx_client.post(
+                    url=api_base,
+                    headers=headers,
+                    data=json.dumps(data),
+                    timeout=response_api_optional_request_params.get("timeout"),
+                )
         except Exception as e:
             raise self._handle_error(
                 e=e,