fix(sagemaker.py): support streaming for messages api

Fixes https://github.com/BerriAI/litellm/issues/5372
Krrish Dholakia 2024-08-26 15:08:08 -07:00
parent 174b1c43e3
commit 8e9acd117b
8 changed files with 142 additions and 32 deletions
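In short: `sagemaker_chat/` models (the SageMaker Messages API) now stream through the OpenAI-compatible completion path by handing it a custom stream decoder (`AWSEventStreamDecoder(model="", is_messages_api=True)`). A minimal sketch of the call this enables, using the endpoint name from the test added in this commit (substitute your own deployed SageMaker endpoint):

import litellm

# Endpoint name copied from the test below - replace with your own deployment.
response = litellm.completion(
    model="sagemaker_chat/huggingface-pytorch-tgi-inference-2024-08-23-15-48-59-245",
    messages=[{"role": "user", "content": "hi - what is ur name"}],
    stream=True,
)
for chunk in response:
    print(chunk.choices[0].delta.content or "", end="")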

View file

@@ -856,7 +856,7 @@ from .llms.vertex_httpx import (
 from .llms.vertex_ai import VertexAITextEmbeddingConfig
 from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
 from .llms.vertex_ai_partner import VertexAILlama3Config
-from .llms.sagemaker import SagemakerConfig
+from .llms.sagemaker.sagemaker import SagemakerConfig
 from .llms.ollama import OllamaConfig
 from .llms.ollama_chat import OllamaChatConfig
 from .llms.maritalk import MaritTalkConfig

View file

@@ -13,4 +13,6 @@ def generic_chunk_has_all_required_fields(chunk: dict) -> bool:
     # this is an optional field in GenericStreamingChunk, it's not required to be present
     _all_fields.pop("provider_specific_fields", None)

-    return all(key in chunk for key in _all_fields)
+    decision = all(key in _all_fields for key in chunk)
+    return decision

View file

@@ -7,7 +7,7 @@ import time
 import types
 from enum import Enum
 from functools import partial
-from typing import Callable, List, Literal, Optional, Tuple, Union
+from typing import Any, Callable, List, Literal, Optional, Tuple, Union

 import httpx  # type: ignore
 import requests  # type: ignore
@@ -22,7 +22,11 @@ from litellm.types.llms.openai import (
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionUsageBlock,
 )
-from litellm.types.utils import GenericStreamingChunk, ProviderField
+from litellm.types.utils import (
+    CustomStreamingDecoder,
+    GenericStreamingChunk,
+    ProviderField,
+)
 from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage

 from .base import BaseLLM
@@ -171,15 +175,21 @@ async def make_call(
     model: str,
     messages: list,
     logging_obj,
+    streaming_decoder: Optional[CustomStreamingDecoder] = None,
 ):
     response = await client.post(api_base, headers=headers, data=data, stream=True)

     if response.status_code != 200:
         raise DatabricksError(status_code=response.status_code, message=response.text)

-    completion_stream = ModelResponseIterator(
-        streaming_response=response.aiter_lines(), sync_stream=False
-    )
+    if streaming_decoder is not None:
+        completion_stream: Any = streaming_decoder.aiter_bytes(
+            response.aiter_bytes(chunk_size=1024)
+        )
+    else:
+        completion_stream = ModelResponseIterator(
+            streaming_response=response.aiter_lines(), sync_stream=False
+        )

     # LOGGING
     logging_obj.post_call(
         input=messages,
@@ -199,6 +209,7 @@ def make_sync_call(
     model: str,
     messages: list,
     logging_obj,
+    streaming_decoder: Optional[CustomStreamingDecoder] = None,
 ):
     if client is None:
         client = HTTPHandler()  # Create a new client if none provided
@@ -208,9 +219,14 @@ def make_sync_call(
     if response.status_code != 200:
         raise DatabricksError(status_code=response.status_code, message=response.read())

-    completion_stream = ModelResponseIterator(
-        streaming_response=response.iter_lines(), sync_stream=True
-    )
+    if streaming_decoder is not None:
+        completion_stream = streaming_decoder.iter_bytes(
+            response.iter_bytes(chunk_size=1024)
+        )
+    else:
+        completion_stream = ModelResponseIterator(
+            streaming_response=response.iter_lines(), sync_stream=True
+        )

     # LOGGING
     logging_obj.post_call(
@@ -283,6 +299,7 @@ class DatabricksChatCompletion(BaseLLM):
         logger_fn=None,
         headers={},
         client: Optional[AsyncHTTPHandler] = None,
+        streaming_decoder: Optional[CustomStreamingDecoder] = None,
     ) -> CustomStreamWrapper:
         data["stream"] = True
@@ -296,6 +313,7 @@ class DatabricksChatCompletion(BaseLLM):
                 model=model,
                 messages=messages,
                 logging_obj=logging_obj,
+                streaming_decoder=streaming_decoder,
             ),
             model=model,
             custom_llm_provider=custom_llm_provider,
@@ -371,6 +389,9 @@ class DatabricksChatCompletion(BaseLLM):
         timeout: Optional[Union[float, httpx.Timeout]] = None,
         client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
         custom_endpoint: Optional[bool] = None,
+        streaming_decoder: Optional[
+            CustomStreamingDecoder
+        ] = None,  # if openai-compatible api needs custom stream decoder - e.g. sagemaker
     ):
         custom_endpoint = custom_endpoint or optional_params.pop(
             "custom_endpoint", None
@@ -436,6 +457,7 @@ class DatabricksChatCompletion(BaseLLM):
                     headers=headers,
                     client=client,
                     custom_llm_provider=custom_llm_provider,
+                    streaming_decoder=streaming_decoder,
                 )
             else:
                 return self.acompletion_function(
@@ -473,6 +495,7 @@ class DatabricksChatCompletion(BaseLLM):
                     model=model,
                     messages=messages,
                     logging_obj=logging_obj,
+                    streaming_decoder=streaming_decoder,
                 ),
                 model=model,
                 custom_llm_provider=custom_llm_provider,

View file

@@ -24,8 +24,11 @@ from litellm.llms.custom_httpx.http_handler import (
 from litellm.types.llms.openai import (
     ChatCompletionToolCallChunk,
     ChatCompletionUsageBlock,
+    OpenAIChatCompletionChunk,
 )
+from litellm.types.utils import CustomStreamingDecoder
 from litellm.types.utils import GenericStreamingChunk as GChunk
+from litellm.types.utils import StreamingChatCompletionChunk
 from litellm.utils import (
     CustomStreamWrapper,
     EmbeddingResponse,
@@ -34,8 +37,8 @@ from litellm.utils import (
     get_secret,
 )

-from .base_aws_llm import BaseAWSLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base_aws_llm import BaseAWSLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory

 _response_stream_shape_cache = None
@@ -241,6 +244,10 @@ class SagemakerLLM(BaseAWSLLM):
                 aws_region_name=aws_region_name,
             )

+            custom_stream_decoder = AWSEventStreamDecoder(
+                model="", is_messages_api=True
+            )
+
             return openai_like_chat_completions.completion(
                 model=model,
                 messages=messages,
@@ -259,6 +266,7 @@ class SagemakerLLM(BaseAWSLLM):
                 headers=prepared_request.headers,
                 custom_endpoint=True,
                 custom_llm_provider="sagemaker_chat",
+                streaming_decoder=custom_stream_decoder,  # type: ignore
             )

         ## Load Config
@@ -332,7 +340,7 @@ class SagemakerLLM(BaseAWSLLM):
             )
             return response
         else:
-            if stream is not None and stream == True:
+            if stream is not None and stream is True:
                 sync_handler = _get_httpx_client()
                 sync_response = sync_handler.post(
                     url=prepared_request.url,
@@ -847,12 +855,21 @@ def get_response_stream_shape():

 class AWSEventStreamDecoder:
-    def __init__(self, model: str) -> None:
+    def __init__(self, model: str, is_messages_api: Optional[bool] = None) -> None:
         from botocore.parsers import EventStreamJSONParser

         self.model = model
         self.parser = EventStreamJSONParser()
         self.content_blocks: List = []
+        self.is_messages_api = is_messages_api
+
+    def _chunk_parser_messages_api(
+        self, chunk_data: dict
+    ) -> StreamingChatCompletionChunk:
+        openai_chunk = StreamingChatCompletionChunk(**chunk_data)
+        return openai_chunk

     def _chunk_parser(self, chunk_data: dict) -> GChunk:
         verbose_logger.debug("in sagemaker chunk parser, chunk_data %s", chunk_data)
@@ -868,6 +885,7 @@ class AWSEventStreamDecoder:
                 index=_index,
                 is_finished=True,
                 finish_reason="stop",
+                usage=None,
             )

         return GChunk(
@@ -875,9 +893,12 @@ class AWSEventStreamDecoder:
             index=_index,
             is_finished=is_finished,
             finish_reason=finish_reason,
+            usage=None,
         )

-    def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GChunk]:
+    def iter_bytes(
+        self, iterator: Iterator[bytes]
+    ) -> Iterator[Optional[Union[GChunk, StreamingChatCompletionChunk]]]:
         """Given an iterator that yields lines, iterate over it & yield every event encountered"""
         from botocore.eventstream import EventStreamBuffer
@@ -898,7 +919,10 @@ class AWSEventStreamDecoder:
                     # Try to parse the accumulated JSON
                     try:
                         _data = json.loads(accumulated_json)
-                        yield self._chunk_parser(chunk_data=_data)
+                        if self.is_messages_api:
+                            yield self._chunk_parser_messages_api(chunk_data=_data)
+                        else:
+                            yield self._chunk_parser(chunk_data=_data)
                         # Reset accumulated_json after successful parsing
                         accumulated_json = ""
                     except json.JSONDecodeError:
@@ -909,16 +933,20 @@ class AWSEventStreamDecoder:
         if accumulated_json:
             try:
                 _data = json.loads(accumulated_json)
-                yield self._chunk_parser(chunk_data=_data)
-            except json.JSONDecodeError:
+                if self.is_messages_api:
+                    yield self._chunk_parser_messages_api(chunk_data=_data)
+                else:
+                    yield self._chunk_parser(chunk_data=_data)
+            except json.JSONDecodeError as e:
                 # Handle or log any unparseable data at the end
                 verbose_logger.error(
                     f"Warning: Unparseable JSON data remained: {accumulated_json}"
                 )
+                yield None

     async def aiter_bytes(
         self, iterator: AsyncIterator[bytes]
-    ) -> AsyncIterator[GChunk]:
+    ) -> AsyncIterator[Optional[Union[GChunk, StreamingChatCompletionChunk]]]:
         """Given an async iterator that yields lines, iterate over it & yield every event encountered"""
         from botocore.eventstream import EventStreamBuffer
@@ -940,7 +968,10 @@ class AWSEventStreamDecoder:
                     # Try to parse the accumulated JSON
                     try:
                         _data = json.loads(accumulated_json)
-                        yield self._chunk_parser(chunk_data=_data)
+                        if self.is_messages_api:
+                            yield self._chunk_parser_messages_api(chunk_data=_data)
+                        else:
+                            yield self._chunk_parser(chunk_data=_data)
                         # Reset accumulated_json after successful parsing
                         accumulated_json = ""
                     except json.JSONDecodeError:
@@ -951,12 +982,16 @@ class AWSEventStreamDecoder:
         if accumulated_json:
             try:
                 _data = json.loads(accumulated_json)
-                yield self._chunk_parser(chunk_data=_data)
+                if self.is_messages_api:
+                    yield self._chunk_parser_messages_api(chunk_data=_data)
+                else:
+                    yield self._chunk_parser(chunk_data=_data)
             except json.JSONDecodeError:
                 # Handle or log any unparseable data at the end
                 verbose_logger.error(
                     f"Warning: Unparseable JSON data remained: {accumulated_json}"
                 )
+                yield None

     def _parse_message_from_event(self, event) -> Optional[str]:
         response_dict = event.to_response_dict()
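For reference, when `is_messages_api=True` the decoder wraps each parsed JSON payload in an OpenAI-style chunk instead of a `GChunk`. A rough sketch of what `_chunk_parser_messages_api` does, assuming the endpoint already emits chat-completion-chunk shaped JSON (the payload below is illustrative, not a captured response):

# Illustrative payload - field values invented for the example.
raw_chunk = {
    "id": "chatcmpl-123",
    "created": 1724700000,
    "model": "my-sagemaker-endpoint",
    "object": "chat.completion.chunk",
    "choices": [
        {"index": 0, "delta": {"role": "assistant", "content": "Hello"}, "finish_reason": None}
    ],
}

decoder = AWSEventStreamDecoder(model="", is_messages_api=True)
openai_chunk = decoder._chunk_parser_messages_api(chunk_data=raw_chunk)
print(openai_chunk.choices[0].delta.content)  # "Hello"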

View file

@@ -119,7 +119,7 @@ from .llms.prompt_templates.factory import (
     prompt_factory,
     stringify_json_tool_call_content,
 )
-from .llms.sagemaker import SagemakerLLM
+from .llms.sagemaker.sagemaker import SagemakerLLM
 from .llms.text_completion_codestral import CodestralTextCompletion
 from .llms.text_to_speech.vertex_ai import VertexTextToSpeechAPI
 from .llms.triton import TritonChatCompletion

View file

@@ -120,15 +120,24 @@ async def test_completion_sagemaker_messages_api(sync_mode):

 @pytest.mark.asyncio()
 @pytest.mark.parametrize("sync_mode", [False, True])
-async def test_completion_sagemaker_stream(sync_mode):
+@pytest.mark.parametrize(
+    "model",
+    [
+        "sagemaker_chat/huggingface-pytorch-tgi-inference-2024-08-23-15-48-59-245",
+        "sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
+    ],
+)
+async def test_completion_sagemaker_stream(sync_mode, model):
     try:
+        from litellm.tests.test_streaming import streaming_format_tests
+
         litellm.set_verbose = False
         print("testing sagemaker")
         verbose_logger.setLevel(logging.DEBUG)
         full_text = ""
         if sync_mode is True:
             response = litellm.completion(
-                model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
+                model=model,
                 messages=[
                     {"role": "user", "content": "hi - what is ur name"},
                 ],
@@ -138,14 +147,15 @@ async def test_completion_sagemaker_stream(sync_mode):
                 input_cost_per_second=0.000420,
             )

-            for chunk in response:
+            for idx, chunk in enumerate(response):
                 print(chunk)
+                streaming_format_tests(idx=idx, chunk=chunk)
                 full_text += chunk.choices[0].delta.content or ""

             print("SYNC RESPONSE full text", full_text)
         else:
             response = await litellm.acompletion(
-                model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
+                model=model,
                 messages=[
                     {"role": "user", "content": "hi - what is ur name"},
                 ],
@@ -156,10 +166,12 @@ async def test_completion_sagemaker_stream(sync_mode):
             )

             print("streaming response")
+            idx = 0
             async for chunk in response:
                 print(chunk)
+                streaming_format_tests(idx=idx, chunk=chunk)
                 full_text += chunk.choices[0].delta.content or ""
+                idx += 1

             print("ASYNC RESPONSE full text", full_text)

View file

@@ -29,6 +29,7 @@ from openai.types.beta.thread_create_params import (
 from openai.types.beta.threads.message import Message as OpenAIMessage
 from openai.types.beta.threads.message_content import MessageContent
 from openai.types.beta.threads.run import Run
+from openai.types.chat import ChatCompletionChunk
 from pydantic import BaseModel, Field
 from typing_extensions import Dict, Required, TypedDict, override
@@ -456,6 +457,13 @@ class ChatCompletionUsageBlock(TypedDict):
     total_tokens: int


+class OpenAIChatCompletionChunk(ChatCompletionChunk):
+    def __init__(self, **kwargs):
+        # Set the 'object' kwarg to 'chat.completion.chunk'
+        kwargs["object"] = "chat.completion.chunk"
+        super().__init__(**kwargs)
+
+
 class Hyperparameters(BaseModel):
     batch_size: Optional[Union[str, int]] = None  # "Number of examples in each batch."
     learning_rate_multiplier: Optional[Union[str, float]] = (
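The subclass presumably exists so that provider payloads which omit or mislabel the `object` field still validate as chat-completion chunks. A small sketch with a hypothetical payload:

# Hypothetical payload: "object" is wrong; the subclass overwrites it before validation.
chunk = OpenAIChatCompletionChunk(
    id="chatcmpl-123",
    created=1724700000,
    model="my-endpoint",
    object="text_completion",
    choices=[{"index": 0, "delta": {"content": "Hi"}, "finish_reason": None}],
)
assert chunk.object == "chat.completion.chunk"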

View file

@@ -5,11 +5,16 @@ from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union

 from openai._models import BaseModel as OpenAIObject
+from openai.types.completion_usage import CompletionUsage
 from pydantic import ConfigDict, Field, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override

 from ..litellm_core_utils.core_helpers import map_finish_reason
-from .llms.openai import ChatCompletionToolCallChunk, ChatCompletionUsageBlock
+from .llms.openai import (
+    ChatCompletionToolCallChunk,
+    ChatCompletionUsageBlock,
+    OpenAIChatCompletionChunk,
+)


 def _generate_id():  # private helper function
@@ -85,7 +90,7 @@ class GenericStreamingChunk(TypedDict, total=False):
     tool_use: Optional[ChatCompletionToolCallChunk]
     is_finished: Required[bool]
     finish_reason: Required[str]
-    usage: Optional[ChatCompletionUsageBlock]
+    usage: Required[Optional[ChatCompletionUsageBlock]]
     index: int

     # use this dict if you want to return any provider specific fields in the response
@@ -448,9 +453,6 @@ class Choices(OpenAIObject):
             setattr(self, key, value)


-from openai.types.completion_usage import CompletionUsage
-
-
 class Usage(CompletionUsage):
     def __init__(
         self,
@@ -535,6 +537,17 @@ class StreamingChoices(OpenAIObject):
             setattr(self, key, value)


+class StreamingChatCompletionChunk(OpenAIChatCompletionChunk):
+    def __init__(self, **kwargs):
+        new_choices = []
+        for choice in kwargs["choices"]:
+            new_choice = StreamingChoices(**choice).model_dump()
+            new_choices.append(new_choice)
+        kwargs["choices"] = new_choices
+        super().__init__(**kwargs)
+
+
 class ModelResponse(OpenAIObject):
     id: str
     """A unique identifier for the completion."""
@@ -1231,3 +1244,20 @@ class StandardLoggingPayload(TypedDict):
     response: Optional[Union[str, list, dict]]
     model_parameters: dict
     hidden_params: StandardLoggingHiddenParams
+
+
+from typing import AsyncIterator, Iterator
+
+
+class CustomStreamingDecoder:
+    async def aiter_bytes(
+        self, iterator: AsyncIterator[bytes]
+    ) -> AsyncIterator[
+        Optional[Union[GenericStreamingChunk, StreamingChatCompletionChunk]]
+    ]:
+        raise NotImplementedError
+
+    def iter_bytes(
+        self, iterator: Iterator[bytes]
+    ) -> Iterator[Optional[Union[GenericStreamingChunk, StreamingChatCompletionChunk]]]:
+        raise NotImplementedError
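`CustomStreamingDecoder` is only an interface: a provider subclasses it (as `AWSEventStreamDecoder` does above) and passes an instance through the new `streaming_decoder` parameter of the OpenAI-compatible completion path. A minimal, purely hypothetical subclass that decodes newline-delimited JSON chunks, for illustration only:

import json

class NewlineJSONDecoder(CustomStreamingDecoder):
    """Hypothetical decoder: each newline-delimited line of the byte stream is a
    JSON object already shaped like a GenericStreamingChunk."""

    def iter_bytes(self, iterator):
        buffer = b""
        for raw in iterator:
            buffer += raw
            while b"\n" in buffer:
                line, buffer = buffer.split(b"\n", 1)
                if line.strip():
                    yield json.loads(line)

    async def aiter_bytes(self, iterator):
        buffer = b""
        async for raw in iterator:
            buffer += raw
            while b"\n" in buffer:
                line, buffer = buffer.split(b"\n", 1)
                if line.strip():
                    yield json.loads(line)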