fix(llm_http_handler.py): fix fake streaming (#10061)

* fix(llm_http_handler.py): fix fake streaming

allows groq to work with llm_http_handler

* fix(groq.py): migrate groq to an OpenAI-like config

ensures JSON mode handling works correctly
Krish Dholakia 2025-04-16 10:15:11 -07:00 committed by GitHub
parent dc29fc2ea0
commit 3a3cc97fc8
5 changed files with 157 additions and 19 deletions
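Background on the fix: before this change, the fake-streaming path fed response.json() straight into provider_config.get_model_response_iterator(); the change below instead transforms the completed HTTP response into a ModelResponse and wraps it in MockResponseIterator so it can be replayed as a one-chunk stream. Below is a minimal, self-contained sketch of that pattern; SingleChunkIterator is a hypothetical stand-in, not litellm's actual MockResponseIterator.

from typing import Any, Iterator


# Minimal sketch of the fake-streaming pattern, assuming the provider call has
# already returned a complete, parsed model response. SingleChunkIterator is a
# hypothetical stand-in for litellm's MockResponseIterator, not the library's
# real implementation.
class SingleChunkIterator:
    """Replay one fully-formed response as if it were a one-chunk stream."""

    def __init__(self, model_response: Any) -> None:
        self._model_response = model_response
        self._done = False

    def __iter__(self) -> Iterator[Any]:
        return self

    def __next__(self) -> Any:
        if self._done:
            raise StopIteration
        self._done = True
        return self._model_response


# Inside a handler, the wiring would then look roughly like:
#   model_response = provider_config.transform_response(...)  # full parsed response
#   completion_stream = SingleChunkIterator(model_response=model_response)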

@@ -11,6 +11,7 @@ from litellm._logging import verbose_logger
from litellm.llms.base_llm.audio_transcription.transformation import (
BaseAudioTranscriptionConfig,
)
from litellm.llms.base_llm.base_model_iterator import MockResponseIterator
from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
from litellm.llms.base_llm.files.transformation import BaseFilesConfig
@@ -231,6 +232,7 @@ class BaseLLMHTTPHandler:
):
json_mode: bool = optional_params.pop("json_mode", False)
extra_body: Optional[dict] = optional_params.pop("extra_body", None)
fake_stream = fake_stream or optional_params.pop("fake_stream", False)
provider_config = ProviderConfigManager.get_provider_chat_config(
model=model, provider=litellm.LlmProviders(custom_llm_provider)
@@ -317,6 +319,7 @@
),
litellm_params=litellm_params,
json_mode=json_mode,
optional_params=optional_params,
)
else:
@@ -378,6 +381,7 @@
),
litellm_params=litellm_params,
json_mode=json_mode,
optional_params=optional_params,
)
return CustomStreamWrapper(
completion_stream=completion_stream,
@@ -426,6 +430,7 @@
model: str,
messages: list,
logging_obj,
optional_params: dict,
litellm_params: dict,
timeout: Union[float, httpx.Timeout],
fake_stream: bool = False,
@@ -457,11 +462,22 @@
)
if fake_stream is True:
completion_stream = provider_config.get_model_response_iterator(
streaming_response=response.json(),
sync_stream=True,
model_response: (ModelResponse) = provider_config.transform_response(
model=model,
raw_response=response,
model_response=litellm.ModelResponse(),
logging_obj=logging_obj,
request_data=data,
messages=messages,
optional_params=optional_params,
litellm_params=litellm_params,
encoding=None,
json_mode=json_mode,
)
completion_stream: Any = MockResponseIterator(
model_response=model_response, json_mode=json_mode
)
else:
completion_stream = provider_config.get_model_response_iterator(
streaming_response=response.iter_lines(),
@@ -491,6 +507,7 @@
logging_obj: LiteLLMLoggingObj,
data: dict,
litellm_params: dict,
optional_params: dict,
fake_stream: bool = False,
client: Optional[AsyncHTTPHandler] = None,
json_mode: Optional[bool] = None,
@@ -509,6 +526,7 @@
)
completion_stream, _response_headers = await self.make_async_call_stream_helper(
model=model,
custom_llm_provider=custom_llm_provider,
provider_config=provider_config,
api_base=api_base,
@@ -520,6 +538,8 @@
fake_stream=fake_stream,
client=client,
litellm_params=litellm_params,
optional_params=optional_params,
json_mode=json_mode,
)
streamwrapper = CustomStreamWrapper(
completion_stream=completion_stream,
@@ -531,6 +551,7 @@
async def make_async_call_stream_helper(
self,
model: str,
custom_llm_provider: str,
provider_config: BaseConfig,
api_base: str,
@@ -540,8 +561,10 @@
logging_obj: LiteLLMLoggingObj,
timeout: Union[float, httpx.Timeout],
litellm_params: dict,
optional_params: dict,
fake_stream: bool = False,
client: Optional[AsyncHTTPHandler] = None,
json_mode: Optional[bool] = None,
) -> Tuple[Any, httpx.Headers]:
"""
Helper function for making an async call with stream.
@@ -572,8 +595,21 @@
)
if fake_stream is True:
completion_stream = provider_config.get_model_response_iterator(
streaming_response=response.json(), sync_stream=False
model_response: (ModelResponse) = provider_config.transform_response(
model=model,
raw_response=response,
model_response=litellm.ModelResponse(),
logging_obj=logging_obj,
request_data=data,
messages=messages,
optional_params=optional_params,
litellm_params=litellm_params,
encoding=None,
json_mode=json_mode,
)
completion_stream: Any = MockResponseIterator(
model_response=model_response, json_mode=json_mode
)
else:
completion_stream = provider_config.get_model_response_iterator(
@@ -598,8 +634,12 @@
"""
Some providers, like Bedrock invoke, do not support the `stream` parameter in the request body, so we only pass `stream` in the request body when the provider supports it.
"""
if fake_stream is True:
return data
# remove 'stream' from data
new_data = data.copy()
new_data.pop("stream", None)
return new_data
if provider_config.supports_stream_param_in_request_body is True:
data["stream"] = True
return data
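
For context, a hedged usage sketch of the path this diff wires up: fake_stream is popped from optional_params, the `stream` key is dropped from the request body (per the code above) so the provider receives a plain non-streaming request, and the parsed response is replayed through a mock iterator. The model name and kwargs below are illustrative assumptions, not values taken from this commit.

import litellm

# Illustrative only: assumes extra kwargs such as fake_stream are forwarded
# into optional_params by litellm.completion, matching the
# optional_params.pop("fake_stream", False) added above.
stream = litellm.completion(
    model="groq/llama3-8b-8192",  # assumed example model
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,       # caller still receives an iterable of chunks
    fake_stream=True,  # assumed flag: provider request is made non-streaming
)

for chunk in stream:
    print(chunk)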