feat(bedrock_httpx.py): working bedrock command-r sync+async streaming
parent 49ab1a1d3f
commit 64650c0279
6 changed files with 342 additions and 51 deletions
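Usage sketch (not part of the diff below): the new sync and async Command R streaming paths can be exercised through the public litellm entry points, mirroring the test added in this commit. The model id, prompt, and max_tokens values are illustrative, and AWS credentials are assumed to be configured in the environment.

# Hedged usage sketch, not part of this commit's diff.
# Sync streaming goes through BedrockLLM.completion -> HTTPHandler.post(stream=True);
# async streaming goes through BedrockLLM.async_streaming -> AsyncHTTPHandler.post.
import asyncio
import litellm

messages = [{"role": "user", "content": "Hey, how's it going?"}]  # illustrative prompt

# sync streaming
for chunk in litellm.completion(
    model="bedrock/cohere.command-r-plus-v1:0",
    messages=messages,
    max_tokens=10,
    stream=True,
):
    print(chunk)

# async streaming
async def main():
    response = await litellm.acompletion(
        model="bedrock/cohere.command-r-plus-v1:0",
        messages=messages,
        max_tokens=10,
        stream=True,
    )
    async for chunk in response:
        print(chunk)

asyncio.run(main())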
@@ -7,7 +7,18 @@ import json
from enum import Enum
import requests, copy  # type: ignore
import time
from typing import Callable, Optional, List, Literal, Union, Any, TypedDict, Tuple
from typing import (
    Callable,
    Optional,
    List,
    Literal,
    Union,
    Any,
    TypedDict,
    Tuple,
    Iterator,
    AsyncIterator,
)
from litellm.utils import (
    ModelResponse,
    Usage,
@@ -330,10 +341,10 @@ class BedrockLLM(BaseLLM):
        encoding,
        logging_obj,
        optional_params: dict,
        acompletion: bool,
        timeout: Optional[Union[float, httpx.Timeout]],
        litellm_params=None,
        logger_fn=None,
        acompletion: bool = False,
        extra_headers: Optional[dict] = None,
        client: Optional[Union[AsyncHTTPHandler, HTTPHandler]] = None,
    ) -> Union[ModelResponse, CustomStreamWrapper]:
@@ -346,6 +357,9 @@ class BedrockLLM(BaseLLM):
        except ImportError as e:
            raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")

        ## SETUP ##
        stream = optional_params.pop("stream", None)

        ## CREDENTIALS ##
        # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
        aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
@@ -400,7 +414,10 @@ class BedrockLLM(BaseLLM):
        else:
            endpoint_url = f"https://bedrock-runtime.{aws_region_name}.amazonaws.com"

        endpoint_url = f"{endpoint_url}/model/{model}/invoke"
        if stream is not None and stream == True:
            endpoint_url = f"{endpoint_url}/model/{model}/invoke-with-response-stream"
        else:
            endpoint_url = f"{endpoint_url}/model/{model}/invoke"

        sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name)

@@ -409,7 +426,6 @@ class BedrockLLM(BaseLLM):
            model, messages, provider, custom_prompt_dict
        )
        inference_params = copy.deepcopy(optional_params)
        stream = inference_params.pop("stream", False)

        if provider == "cohere":
            if model.startswith("cohere.command-r"):
@@ -420,11 +436,6 @@ class BedrockLLM(BaseLLM):
                        k not in inference_params
                    ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                        inference_params[k] = v
                if optional_params.get("stream", False) == True:
                    inference_params["stream"] = (
                        True  # cohere requires stream = True in inference params
                    )

                _data = {"message": prompt, **inference_params}
                if chat_history is not None:
                    _data["chat_history"] = chat_history
@@ -437,7 +448,7 @@ class BedrockLLM(BaseLLM):
                        k not in inference_params
                    ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                        inference_params[k] = v
                if optional_params.get("stream", False) == True:
                if stream == True:
                    inference_params["stream"] = (
                        True  # cohere requires stream = True in inference params
                    )
@@ -446,6 +457,7 @@ class BedrockLLM(BaseLLM):
            raise Exception("UNSUPPORTED PROVIDER")

        ## COMPLETION CALL

        headers = {"Content-Type": "application/json"}
        if extra_headers is not None:
            headers = {"Content-Type": "application/json", **extra_headers}
@@ -455,11 +467,39 @@ class BedrockLLM(BaseLLM):
        sigv4.add_auth(request)
        prepped = request.prepare()

        ## LOGGING
        logging_obj.pre_call(
            input=messages,
            api_key="",
            additional_args={
                "complete_input_dict": data,
                "api_base": prepped.url,
                "headers": prepped.headers,
            },
        )

        ### ROUTING (ASYNC, STREAMING, SYNC)
        if acompletion:
            if isinstance(client, HTTPHandler):
                client = None

            if stream:
                return self.async_streaming(
                    model=model,
                    messages=messages,
                    data=data,
                    api_base=prepped.url,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    encoding=encoding,
                    logging_obj=logging_obj,
                    optional_params=optional_params,
                    stream=True,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    headers=prepped.headers,
                    timeout=timeout,
                    client=client,
                )  # type: ignore
            ### ASYNC COMPLETION
            return self.async_completion(
                model=model,
@@ -488,17 +528,29 @@ class BedrockLLM(BaseLLM):
            self.client = HTTPHandler(**_params)  # type: ignore
        else:
            self.client = client
        if stream is not None and stream == True:
            response = self.client.post(
                url=prepped.url,
                headers=prepped.headers,  # type: ignore
                data=data,
                stream=stream,
            )

            ## LOGGING
            logging_obj.pre_call(
                input=messages,
                api_key="",
                additional_args={
                    "complete_input_dict": data,
                    "api_base": prepped.url,
                    "headers": prepped.headers,
                },
            )
            if response.status_code != 200:
                raise BedrockError(
                    status_code=response.status_code, message=response.text
                )

            decoder = AWSEventStreamDecoder()

            completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024))
            streaming_response = CustomStreamWrapper(
                completion_stream=completion_stream,
                model=model,
                custom_llm_provider="bedrock",
                logging_obj=logging_obj,
            )
            return streaming_response

        response = self.client.post(url=prepped.url, headers=prepped.headers, data=data)  # type: ignore

@@ -565,5 +617,117 @@ class BedrockLLM(BaseLLM):
            encoding=encoding,
        )

    async def async_streaming(
        self,
        model: str,
        messages: list,
        api_base: str,
        model_response: ModelResponse,
        print_verbose: Callable,
        data: str,
        timeout: Optional[Union[float, httpx.Timeout]],
        encoding,
        logging_obj,
        stream,
        optional_params: dict,
        litellm_params=None,
        logger_fn=None,
        headers={},
        client: Optional[AsyncHTTPHandler] = None,
    ) -> CustomStreamWrapper:
        if client is None:
            _params = {}
            if timeout is not None:
                if isinstance(timeout, float) or isinstance(timeout, int):
                    timeout = httpx.Timeout(timeout)
                _params["timeout"] = timeout
            self.client = AsyncHTTPHandler(**_params)  # type: ignore
        else:
            self.client = client  # type: ignore

        response = await self.client.post(api_base, headers=headers, data=data, stream=True)  # type: ignore

        if response.status_code != 200:
            raise BedrockError(status_code=response.status_code, message=response.text)

        decoder = AWSEventStreamDecoder()

        completion_stream = decoder.aiter_bytes(response.aiter_bytes(chunk_size=1024))
        streaming_response = CustomStreamWrapper(
            completion_stream=completion_stream,
            model=model,
            custom_llm_provider="bedrock",
            logging_obj=logging_obj,
        )
        return streaming_response

    def embedding(self, *args, **kwargs):
        return super().embedding(*args, **kwargs)


def get_response_stream_shape():
    from botocore.model import ServiceModel
    from botocore.loaders import Loader

    loader = Loader()
    bedrock_service_dict = loader.load_service_model("bedrock-runtime", "service-2")
    bedrock_service_model = ServiceModel(bedrock_service_dict)
    return bedrock_service_model.shape_for("ResponseStream")


class AWSEventStreamDecoder:
    def __init__(self) -> None:
        from botocore.parsers import EventStreamJSONParser

        self.parser = EventStreamJSONParser()

    def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GenericStreamingChunk]:
        """Given an iterator that yields lines, iterate over it & yield every event encountered"""
        from botocore.eventstream import EventStreamBuffer

        event_stream_buffer = EventStreamBuffer()
        for chunk in iterator:
            event_stream_buffer.add_data(chunk)
            for event in event_stream_buffer:
                message = self._parse_message_from_event(event)
                if message:
                    # sse_event = ServerSentEvent(data=message, event="completion")
                    _data = json.loads(message)
                    streaming_chunk: GenericStreamingChunk = GenericStreamingChunk(
                        text=_data.get("text", ""),
                        is_finished=_data.get("is_finished", False),
                        finish_reason=_data.get("finish_reason", ""),
                    )
                    yield streaming_chunk

    async def aiter_bytes(
        self, iterator: AsyncIterator[bytes]
    ) -> AsyncIterator[GenericStreamingChunk]:
        """Given an async iterator that yields lines, iterate over it & yield every event encountered"""
        from botocore.eventstream import EventStreamBuffer

        event_stream_buffer = EventStreamBuffer()
        async for chunk in iterator:
            event_stream_buffer.add_data(chunk)
            for event in event_stream_buffer:
                message = self._parse_message_from_event(event)
                if message:
                    _data = json.loads(message)
                    streaming_chunk: GenericStreamingChunk = GenericStreamingChunk(
                        text=_data.get("text", ""),
                        is_finished=_data.get("is_finished", False),
                        finish_reason=_data.get("finish_reason", ""),
                    )
                    yield streaming_chunk

    def _parse_message_from_event(self, event) -> str | None:
        response_dict = event.to_response_dict()
        parsed_response = self.parser.parse(response_dict, get_response_stream_shape())
        if response_dict["status_code"] != 200:
            raise ValueError(f"Bad response code, expected 200: {response_dict}")

        chunk = parsed_response.get("chunk")
        if not chunk:
            return None

        return chunk.get("bytes").decode()  # type: ignore[no-any-return]

@@ -91,11 +91,15 @@ class HTTPHandler:
    def post(
        self,
        url: str,
        data: Optional[dict] = None,
        data: Optional[Union[dict, str]] = None,
        params: Optional[dict] = None,
        headers: Optional[dict] = None,
        stream: bool = False,
    ):
        response = self.client.post(url, data=data, params=params, headers=headers)
        req = self.client.build_request(
            "POST", url, data=data, params=params, headers=headers  # type: ignore
        )
        response = self.client.send(req, stream=stream)
        return response

    def __del__(self) -> None:

@@ -257,7 +257,7 @@ async def acompletion(
    - If `stream` is True, the function returns an async generator that yields completion lines.
    """
    loop = asyncio.get_event_loop()
    custom_llm_provider = None
    custom_llm_provider = kwargs.get("custom_llm_provider", None)
    # Adjusted to use explicit arguments instead of *args and **kwargs
    completion_kwargs = {
        "model": model,
@@ -289,9 +289,10 @@ async def acompletion(
        "model_list": model_list,
        "acompletion": True,  # assuming this is a required parameter
    }
    _, custom_llm_provider, _, _ = get_llm_provider(
        model=model, api_base=completion_kwargs.get("base_url", None)
    )
    if custom_llm_provider is None:
        _, custom_llm_provider, _, _ = get_llm_provider(
            model=model, api_base=completion_kwargs.get("base_url", None)
        )
    try:
        # Use a partial function to pass your keyword arguments
        func = partial(completion, **completion_kwargs, **kwargs)
@@ -300,9 +301,6 @@ async def acompletion(
        ctx = contextvars.copy_context()
        func_with_context = partial(ctx.run, func)

        _, custom_llm_provider, _, _ = get_llm_provider(
            model=model, api_base=kwargs.get("api_base", None)
        )
        if (
            custom_llm_provider == "openai"
            or custom_llm_provider == "azure"
@@ -324,6 +322,7 @@ async def acompletion(
            or custom_llm_provider == "sagemaker"
            or custom_llm_provider == "anthropic"
            or custom_llm_provider == "predibase"
            or (custom_llm_provider == "bedrock" and "cohere" in model)
            or custom_llm_provider in litellm.openai_compatible_providers
        ):  # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
            init_response = await loop.run_in_executor(None, func_with_context)
@@ -1937,6 +1936,7 @@ def completion(
                    logging_obj=logging,
                    extra_headers=extra_headers,
                    timeout=timeout,
                    acompletion=acompletion,
                )
            else:
                response = bedrock.completion(
@@ -1954,26 +1954,26 @@ def completion(
                    timeout=timeout,
                )

            if (
                "stream" in optional_params
                and optional_params["stream"] == True
                and not isinstance(response, CustomStreamWrapper)
            ):
                # don't try to access stream object,
                if "ai21" in model:
                    response = CustomStreamWrapper(
                        response,
                        model,
                        custom_llm_provider="bedrock",
                        logging_obj=logging,
                    )
                else:
                    response = CustomStreamWrapper(
                        iter(response),
                        model,
                        custom_llm_provider="bedrock",
                        logging_obj=logging,
                    )
                if (
                    "stream" in optional_params
                    and optional_params["stream"] == True
                    and not isinstance(response, CustomStreamWrapper)
                ):
                    # don't try to access stream object,
                    if "ai21" in model:
                        response = CustomStreamWrapper(
                            response,
                            model,
                            custom_llm_provider="bedrock",
                            logging_obj=logging,
                        )
                    else:
                        response = CustomStreamWrapper(
                            iter(response),
                            model,
                            custom_llm_provider="bedrock",
                            logging_obj=logging,
                        )

        if optional_params.get("stream", False):
            ## LOGGING

@@ -984,6 +984,65 @@ def test_vertex_ai_stream():
    # pytest.fail(f"Error occurred: {e}")


@pytest.mark.parametrize("sync_mode", [True])
@pytest.mark.asyncio
async def test_bedrock_cohere_command_r_streaming(sync_mode):
    try:
        litellm.set_verbose = True
        if sync_mode:
            final_chunk: Optional[litellm.ModelResponse] = None
            response: litellm.CustomStreamWrapper = completion(  # type: ignore
                model="bedrock/cohere.command-r-plus-v1:0",
                messages=messages,
                max_tokens=10,  # type: ignore
                stream=True,
            )
            complete_response = ""
            # Add any assertions here to check the response
            has_finish_reason = False
            for idx, chunk in enumerate(response):
                final_chunk = chunk
                chunk, finished = streaming_format_tests(idx, chunk)
                if finished:
                    has_finish_reason = True
                    break
                complete_response += chunk
            if has_finish_reason == False:
                raise Exception("finish reason not set")
            if complete_response.strip() == "":
                raise Exception("Empty response received")
        else:
            response: litellm.CustomStreamWrapper = await litellm.acompletion(  # type: ignore
                model="bedrock/cohere.command-r-plus-v1:0",
                messages=messages,
                max_tokens=100,  # type: ignore
                stream=True,
            )
            complete_response = ""
            # Add any assertions here to check the response
            has_finish_reason = False
            idx = 0
            final_chunk: Optional[litellm.ModelResponse] = None
            async for chunk in response:
                final_chunk = chunk
                chunk, finished = streaming_format_tests(idx, chunk)
                if finished:
                    has_finish_reason = True
                    break
                complete_response += chunk
                idx += 1
            if has_finish_reason == False:
                raise Exception("finish reason not set")
            if complete_response.strip() == "":
                raise Exception("Empty response received")
        print(f"completion_response: {complete_response}\n\nFinalChunk: {final_chunk}")
        raise Exception("it worked!")
    except RateLimitError:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


def test_bedrock_claude_3_streaming():
    try:
        litellm.set_verbose = True

@@ -1,6 +1,63 @@
from typing import TypedDict
from typing import TypedDict, Any
import json
from typing_extensions import (
    Self,
    Protocol,
    TypeGuard,
    override,
    get_origin,
    runtime_checkable,
    Required,
)


class GenericStreamingChunk(TypedDict):
    text: Required[str]
    is_finished: Required[bool]
    finish_reason: Required[str]


class Document(TypedDict):
    title: str
    snippet: str


class ServerSentEvent:
    def __init__(
        self,
        *,
        event: str | None = None,
        data: str | None = None,
        id: str | None = None,
        retry: int | None = None,
    ) -> None:
        if data is None:
            data = ""

        self._id = id
        self._data = data
        self._event = event or None
        self._retry = retry

    @property
    def event(self) -> str | None:
        return self._event

    @property
    def id(self) -> str | None:
        return self._id

    @property
    def retry(self) -> int | None:
        return self._retry

    @property
    def data(self) -> str:
        return self._data

    def json(self) -> Any:
        return json.loads(self.data)

    @override
    def __repr__(self) -> str:
        return f"ServerSentEvent(event={self.event}, data={self.data}, id={self.id}, retry={self.retry})"

@@ -10262,6 +10262,12 @@ class CustomStreamWrapper:
            raise e

    def handle_bedrock_stream(self, chunk):
        if "cohere" in self.model:
            return {
                "text": chunk["text"],
                "is_finished": chunk["is_finished"],
                "finish_reason": chunk["finish_reason"],
            }
        if hasattr(chunk, "get"):
            chunk = chunk.get("chunk")
            chunk_data = json.loads(chunk.get("bytes").decode())
@@ -11068,6 +11074,7 @@ class CustomStreamWrapper:
                or self.custom_llm_provider == "gemini"
                or self.custom_llm_provider == "cached_response"
                or self.custom_llm_provider == "predibase"
                or (self.custom_llm_provider == "bedrock" and "cohere" in self.model)
                or self.custom_llm_provider in litellm.openai_compatible_endpoints
            ):
                async for chunk in self.completion_stream:
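
The change can be verified with the test added in this commit; selecting it by name avoids guessing the test module's path, which is not shown in this view:

pytest -k test_bedrock_cohere_command_r_streaming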