feat(bedrock_httpx.py): working bedrock command-r sync+async streaming

2024-05-11 19:39:51 -07:00 · 2024-05-11 19:39:51 -07:00 · 64650c0279
commit 64650c0279
parent 49ab1a1d3f
6 changed files with 342 additions and 51 deletions
--- a/litellm/llms/bedrock_httpx.py
+++ b/litellm/llms/bedrock_httpx.py
@ -7,7 +7,18 @@ import json
 from enum import Enum
 import requests, copy  # type: ignore
 import time
-from typing import Callable, Optional, List, Literal, Union, Any, TypedDict, Tuple
+from typing import (
    Callable,
    Optional,
    List,
    Literal,
    Union,
    Any,
    TypedDict,
    Tuple,
    Iterator,
    AsyncIterator,
 )
 from litellm.utils import (
    ModelResponse,
    Usage,
@ -330,10 +341,10 @@ class BedrockLLM(BaseLLM):
        encoding,
        logging_obj,
        optional_params: dict,
        acompletion: bool,
        timeout: Optional[Union[float, httpx.Timeout]],
        litellm_params=None,
        logger_fn=None,
        acompletion: bool = False,
        extra_headers: Optional[dict] = None,
        client: Optional[Union[AsyncHTTPHandler, HTTPHandler]] = None,
    ) -> Union[ModelResponse, CustomStreamWrapper]:
@ -346,6 +357,9 @@ class BedrockLLM(BaseLLM):
        except ImportError as e:
            raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")
        ## SETUP ##
        stream = optional_params.pop("stream", None)
        ## CREDENTIALS ##
        # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
        aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
@ -400,7 +414,10 @@ class BedrockLLM(BaseLLM):
        else:
            endpoint_url = f"https://bedrock-runtime.{aws_region_name}.amazonaws.com"
-        endpoint_url = f"{endpoint_url}/model/{model}/invoke"
+        if stream is not None and stream == True:
            endpoint_url = f"{endpoint_url}/model/{model}/invoke-with-response-stream"
        else:
            endpoint_url = f"{endpoint_url}/model/{model}/invoke"
        sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name)
@ -409,7 +426,6 @@ class BedrockLLM(BaseLLM):
            model, messages, provider, custom_prompt_dict
        )
        inference_params = copy.deepcopy(optional_params)
        stream = inference_params.pop("stream", False)
        if provider == "cohere":
            if model.startswith("cohere.command-r"):
@ -420,11 +436,6 @@ class BedrockLLM(BaseLLM):
                        k not in inference_params
                    ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                        inference_params[k] = v
                if optional_params.get("stream", False) == True:
                    inference_params["stream"] = (
                        True  # cohere requires stream = True in inference params
                    )
                _data = {"message": prompt, **inference_params}
                if chat_history is not None:
                    _data["chat_history"] = chat_history
@ -437,7 +448,7 @@ class BedrockLLM(BaseLLM):
                        k not in inference_params
                    ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                        inference_params[k] = v
-                if optional_params.get("stream", False) == True:
+                if stream == True:
                    inference_params["stream"] = (
                        True  # cohere requires stream = True in inference params
                    )
@ -446,6 +457,7 @@ class BedrockLLM(BaseLLM):
            raise Exception("UNSUPPORTED PROVIDER")
        ## COMPLETION CALL
        headers = {"Content-Type": "application/json"}
        if extra_headers is not None:
            headers = {"Content-Type": "application/json", **extra_headers}
@ -455,11 +467,39 @@ class BedrockLLM(BaseLLM):
        sigv4.add_auth(request)
        prepped = request.prepare()
        ## LOGGING
        logging_obj.pre_call(
            input=messages,
            api_key="",
            additional_args={
                "complete_input_dict": data,
                "api_base": prepped.url,
                "headers": prepped.headers,
            },
        )
        ### ROUTING (ASYNC, STREAMING, SYNC)
        if acompletion:
            if isinstance(client, HTTPHandler):
                client = None
-
+            if stream:
                return self.async_streaming(
                    model=model,
                    messages=messages,
                    data=data,
                    api_base=prepped.url,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    encoding=encoding,
                    logging_obj=logging_obj,
                    optional_params=optional_params,
                    stream=True,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    headers=prepped.headers,
                    timeout=timeout,
                    client=client,
                )  # type: ignore
            ### ASYNC COMPLETION
            return self.async_completion(
                model=model,
@ -488,17 +528,29 @@ class BedrockLLM(BaseLLM):
            self.client = HTTPHandler(**_params)  # type: ignore
        else:
            self.client = client
        if stream is not None and stream == True:
            response = self.client.post(
                url=prepped.url,
                headers=prepped.headers,  # type: ignore
                data=data,
                stream=stream,
            )
-        ## LOGGING
+            if response.status_code != 200:
-        logging_obj.pre_call(
+                raise BedrockError(
-            input=messages,
+                    status_code=response.status_code, message=response.text
-            api_key="",
+                )
-            additional_args={
+
-                "complete_input_dict": data,
+            decoder = AWSEventStreamDecoder()
-                "api_base": prepped.url,
+
-                "headers": prepped.headers,
+            completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024))
-            },
+            streaming_response = CustomStreamWrapper(
-        )
+                completion_stream=completion_stream,
                model=model,
                custom_llm_provider="bedrock",
                logging_obj=logging_obj,
            )
            return streaming_response
        response = self.client.post(url=prepped.url, headers=prepped.headers, data=data)  # type: ignore
@ -565,5 +617,117 @@ class BedrockLLM(BaseLLM):
            encoding=encoding,
        )
    async def async_streaming(
        self,
        model: str,
        messages: list,
        api_base: str,
        model_response: ModelResponse,
        print_verbose: Callable,
        data: str,
        timeout: Optional[Union[float, httpx.Timeout]],
        encoding,
        logging_obj,
        stream,
        optional_params: dict,
        litellm_params=None,
        logger_fn=None,
        headers: Optional[dict] = None,
        client: Optional[AsyncHTTPHandler] = None,
    ) -> CustomStreamWrapper:
        """POST a signed Bedrock request and wrap the streamed reply.

        Args:
            model: Model name handed to the returned ``CustomStreamWrapper``.
            api_base: Fully prepared request URL.
            data: Serialized request body.
            timeout: Only used when no ``client`` is supplied; a bare number is
                converted to ``httpx.Timeout``.
            logging_obj: Logging object handed to the returned wrapper.
            headers: Request headers (already signed by the caller). ``None``
                is treated as an empty dict.
            client: Async HTTP handler to reuse; a new ``AsyncHTTPHandler`` is
                created when omitted.

        ``messages``, ``model_response``, ``print_verbose``, ``encoding``,
        ``stream``, ``optional_params``, ``litellm_params`` and ``logger_fn``
        are accepted for signature parity with the other call paths and are
        not read here.

        Returns:
            A ``CustomStreamWrapper`` over the decoded AWS event stream.

        Raises:
            BedrockError: If the response status code is not 200.
        """
        if headers is None:
            headers = {}

        if client is None:
            _params = {}
            if timeout is not None:
                if isinstance(timeout, (float, int)):
                    timeout = httpx.Timeout(timeout)
                _params["timeout"] = timeout
            self.client = AsyncHTTPHandler(**_params)  # type: ignore
        else:
            self.client = client  # type: ignore

        response = await self.client.post(api_base, headers=headers, data=data, stream=True)  # type: ignore

        if response.status_code != 200:
            # The request was sent with stream=True, so the body has not been
            # loaded yet; it must be read before `.text` can be accessed.
            # NOTE(review): assumes AsyncHTTPHandler.post returns an
            # httpx.Response, as the aiter_bytes() call below also does.
            await response.aread()
            raise BedrockError(status_code=response.status_code, message=response.text)

        decoder = AWSEventStreamDecoder()

        completion_stream = decoder.aiter_bytes(response.aiter_bytes(chunk_size=1024))
        streaming_response = CustomStreamWrapper(
            completion_stream=completion_stream,
            model=model,
            custom_llm_provider="bedrock",
            logging_obj=logging_obj,
        )
        return streaming_response
    def embedding(self, *args, **kwargs):
        """Forward every positional and keyword argument to the parent ``embedding``."""
        parent_result = super().embedding(*args, **kwargs)
        return parent_result
 def get_response_stream_shape():
    """Return the ``ResponseStream`` shape of the ``bedrock-runtime`` service model.

    botocore is imported inside the function, so it is only needed when this
    function is actually called.
    """
    from botocore.loaders import Loader
    from botocore.model import ServiceModel

    service_description = Loader().load_service_model("bedrock-runtime", "service-2")
    return ServiceModel(service_description).shape_for("ResponseStream")
 class AWSEventStreamDecoder:
    """Decode an AWS event-stream byte stream into ``GenericStreamingChunk`` dicts.

    Raw byte chunks are fed into a botocore ``EventStreamBuffer``; each event
    it yields is parsed with ``EventStreamJSONParser`` and the JSON payload of
    its ``chunk`` member is mapped onto ``text`` / ``is_finished`` /
    ``finish_reason``.
    """

    def __init__(self) -> None:
        from botocore.parsers import EventStreamJSONParser

        self.parser = EventStreamJSONParser()
        # Loaded on the first parsed event and reused afterwards, so the
        # service model is not re-read for every single event.
        self._response_stream_shape = None

    def _chunks_from_buffer(
        self, event_stream_buffer
    ) -> Iterator[GenericStreamingChunk]:
        """Yield a streaming chunk for every event currently available in the buffer."""
        for event in event_stream_buffer:
            message = self._parse_message_from_event(event)
            if message:
                _data = json.loads(message)
                yield GenericStreamingChunk(
                    text=_data.get("text", ""),
                    is_finished=_data.get("is_finished", False),
                    finish_reason=_data.get("finish_reason", ""),
                )

    def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GenericStreamingChunk]:
        """Given an iterator that yields raw byte chunks, yield every decoded event."""
        from botocore.eventstream import EventStreamBuffer

        event_stream_buffer = EventStreamBuffer()
        for chunk in iterator:
            event_stream_buffer.add_data(chunk)
            yield from self._chunks_from_buffer(event_stream_buffer)

    async def aiter_bytes(
        self, iterator: AsyncIterator[bytes]
    ) -> AsyncIterator[GenericStreamingChunk]:
        """Given an async iterator that yields raw byte chunks, yield every decoded event."""
        from botocore.eventstream import EventStreamBuffer

        event_stream_buffer = EventStreamBuffer()
        async for chunk in iterator:
            event_stream_buffer.add_data(chunk)
            for streaming_chunk in self._chunks_from_buffer(event_stream_buffer):
                yield streaming_chunk

    def _parse_message_from_event(self, event) -> Optional[str]:
        """Return the decoded payload of an event's ``chunk`` member, or ``None``.

        Raises:
            ValueError: If the event's response dict has a status code other
                than 200.
        """
        if self._response_stream_shape is None:
            self._response_stream_shape = get_response_stream_shape()

        response_dict = event.to_response_dict()
        parsed_response = self.parser.parse(response_dict, self._response_stream_shape)
        if response_dict["status_code"] != 200:
            raise ValueError(f"Bad response code, expected 200: {response_dict}")

        chunk = parsed_response.get("chunk")
        if not chunk:
            return None

        return chunk.get("bytes").decode()  # type: ignore[no-any-return]
--- a/litellm/llms/custom_httpx/http_handler.py
+++ b/litellm/llms/custom_httpx/http_handler.py
@ -91,11 +91,15 @@ class HTTPHandler:
    def post(
        self,
        url: str,
-        data: Optional[dict] = None,
+        data: Optional[Union[dict, str]] = None,
        params: Optional[dict] = None,
        headers: Optional[dict] = None,
        stream: bool = False,
    ):
        # Send a POST request through the wrapped client and return its response.
        # `data` may be a dict or a string body.
        # The request is built first and then sent explicitly so that the
        # `stream` flag can be forwarded to `self.client.send`.
-        response = self.client.post(url, data=data, params=params, headers=headers)
+        req = self.client.build_request(
            "POST", url, data=data, params=params, headers=headers  # type: ignore
        )
        response = self.client.send(req, stream=stream)
        return response
    def __del__(self) -> None:
--- a/litellm/main.py
+++ b/litellm/main.py
@ -257,7 +257,7 @@ async def acompletion(
        - If `stream` is True, the function returns an async generator that yields completion lines.
    """
    loop = asyncio.get_event_loop()
-    custom_llm_provider = None
+    custom_llm_provider = kwargs.get("custom_llm_provider", None)
    # Adjusted to use explicit arguments instead of *args and **kwargs
    completion_kwargs = {
        "model": model,
@ -289,9 +289,10 @@ async def acompletion(
        "model_list": model_list,
        "acompletion": True,  # assuming this is a required parameter
    }
-    _, custom_llm_provider, _, _ = get_llm_provider(
+    if custom_llm_provider is None:
-        model=model, api_base=completion_kwargs.get("base_url", None)
+        _, custom_llm_provider, _, _ = get_llm_provider(
-    )
+            model=model, api_base=completion_kwargs.get("base_url", None)
        )
    try:
        # Use a partial function to pass your keyword arguments
        func = partial(completion, **completion_kwargs, **kwargs)
@ -300,9 +301,6 @@ async def acompletion(
        ctx = contextvars.copy_context()
        func_with_context = partial(ctx.run, func)
        _, custom_llm_provider, _, _ = get_llm_provider(
            model=model, api_base=kwargs.get("api_base", None)
        )
        if (
            custom_llm_provider == "openai"
            or custom_llm_provider == "azure"
@ -324,6 +322,7 @@ async def acompletion(
            or custom_llm_provider == "sagemaker"
            or custom_llm_provider == "anthropic"
            or custom_llm_provider == "predibase"
            or (custom_llm_provider == "bedrock" and "cohere" in model)
            or custom_llm_provider in litellm.openai_compatible_providers
        ):  # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
            init_response = await loop.run_in_executor(None, func_with_context)
@ -1937,6 +1936,7 @@ def completion(
                    logging_obj=logging,
                    extra_headers=extra_headers,
                    timeout=timeout,
                    acompletion=acompletion,
                )
            else:
                response = bedrock.completion(
@ -1954,26 +1954,26 @@ def completion(
                    timeout=timeout,
                )
-            if (
+                if (
-                "stream" in optional_params
+                    "stream" in optional_params
-                and optional_params["stream"] == True
+                    and optional_params["stream"] == True
-                and not isinstance(response, CustomStreamWrapper)
+                    and not isinstance(response, CustomStreamWrapper)
-            ):
+                ):
-                # don't try to access stream object,
+                    # don't try to access stream object,
-                if "ai21" in model:
+                    if "ai21" in model:
-                    response = CustomStreamWrapper(
+                        response = CustomStreamWrapper(
-                        response,
+                            response,
-                        model,
+                            model,
-                        custom_llm_provider="bedrock",
+                            custom_llm_provider="bedrock",
-                        logging_obj=logging,
+                            logging_obj=logging,
-                    )
+                        )
-                else:
+                    else:
-                    response = CustomStreamWrapper(
+                        response = CustomStreamWrapper(
-                        iter(response),
+                            iter(response),
-                        model,
+                            model,
-                        custom_llm_provider="bedrock",
+                            custom_llm_provider="bedrock",
-                        logging_obj=logging,
+                            logging_obj=logging,
-                    )
+                        )
            if optional_params.get("stream", False):
                ## LOGGING
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@ -984,6 +984,65 @@ def test_vertex_ai_stream():
 #         pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize("sync_mode", [True])
@pytest.mark.asyncio
async def test_bedrock_cohere_command_r_streaming(sync_mode):
    """Stream a Bedrock Cohere command-r completion and validate every chunk.

    With ``sync_mode`` true the stream comes from ``completion``; otherwise it
    comes from ``litellm.acompletion``. The test fails if no chunk reports a
    finish reason or if the accumulated text is empty. Rate-limit errors are
    tolerated.
    """
    try:
        litellm.set_verbose = True
        complete_response = ""
        has_finish_reason = False
        final_chunk: Optional[litellm.ModelResponse] = None

        if sync_mode:
            response: litellm.CustomStreamWrapper = completion(  # type: ignore
                model="bedrock/cohere.command-r-plus-v1:0",
                messages=messages,
                max_tokens=10,  # type: ignore
                stream=True,
            )
            for idx, chunk in enumerate(response):
                final_chunk = chunk
                chunk, finished = streaming_format_tests(idx, chunk)
                if finished:
                    has_finish_reason = True
                    break
                complete_response += chunk
        else:
            async_response: litellm.CustomStreamWrapper = await litellm.acompletion(  # type: ignore
                model="bedrock/cohere.command-r-plus-v1:0",
                messages=messages,
                max_tokens=100,  # type: ignore
                stream=True,
            )
            idx = 0
            async for chunk in async_response:
                final_chunk = chunk
                chunk, finished = streaming_format_tests(idx, chunk)
                if finished:
                    has_finish_reason = True
                    break
                complete_response += chunk
                idx += 1

        # Checks shared by the sync and async paths.
        if not has_finish_reason:
            raise Exception("finish reason not set")
        if complete_response.strip() == "":
            raise Exception("Empty response received")
        print(f"completion_response: {complete_response}\n\nFinalChunk: {final_chunk}")
    except RateLimitError:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
 def test_bedrock_claude_3_streaming():
    try:
        litellm.set_verbose = True
--- a/litellm/types/llms/bedrock.py
+++ b/litellm/types/llms/bedrock.py
@ -1,6 +1,63 @@
-from typing import TypedDict
+from typing import TypedDict, Any, Optional
 import json
 from typing_extensions import (
    Self,
    Protocol,
    TypeGuard,
    override,
    get_origin,
    runtime_checkable,
    Required,
 )
 class GenericStreamingChunk(TypedDict):
    """One decoded streaming chunk; all three keys are required."""

    # Text carried by this chunk.
    text: Required[str]
    # Whether this chunk marks the end of the stream.
    is_finished: Required[bool]
    # Finish reason reported with the chunk.
    finish_reason: Required[str]
 class Document(TypedDict):
    """A document described by a title and a text snippet."""

    # NOTE(review): presumably a document supplied to the model for grounding; confirm against callers.
    title: str
    snippet: str
 class ServerSentEvent:
    """Read-only container for the fields of one server-sent event.

    Args:
        event: Event name; an empty string is stored as ``None``.
        data: Event payload; ``None`` is stored as an empty string.
        id: Event id.
        retry: Retry value.
    """

    def __init__(
        self,
        *,
        event: Optional[str] = None,
        data: Optional[str] = None,
        id: Optional[str] = None,
        retry: Optional[int] = None,
    ) -> None:
        if data is None:
            data = ""

        self._id = id
        self._data = data
        self._event = event or None
        self._retry = retry

    @property
    def event(self) -> Optional[str]:
        return self._event

    @property
    def id(self) -> Optional[str]:
        return self._id

    @property
    def retry(self) -> Optional[int]:
        return self._retry

    @property
    def data(self) -> str:
        return self._data

    def json(self) -> Any:
        """Parse ``data`` as JSON and return the result."""
        return json.loads(self.data)

    @override
    def __repr__(self) -> str:
        return f"ServerSentEvent(event={self.event}, data={self.data}, id={self.id}, retry={self.retry})"
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -10262,6 +10262,12 @@ class CustomStreamWrapper:
            raise e
    def handle_bedrock_stream(self, chunk):
        if "cohere" in self.model:
            return {
                "text": chunk["text"],
                "is_finished": chunk["is_finished"],
                "finish_reason": chunk["finish_reason"],
            }
        if hasattr(chunk, "get"):
            chunk = chunk.get("chunk")
            chunk_data = json.loads(chunk.get("bytes").decode())
@ -11068,6 +11074,7 @@ class CustomStreamWrapper:
                or self.custom_llm_provider == "gemini"
                or self.custom_llm_provider == "cached_response"
                or self.custom_llm_provider == "predibase"
                or (self.custom_llm_provider == "bedrock" and "cohere" in self.model)
                or self.custom_llm_provider in litellm.openai_compatible_endpoints
            ):
                async for chunk in self.completion_stream: