refactor sagemaker to be async

2024-08-15 18:18:02 -07:00 · 2024-08-15 18:18:02 -07:00 · df4ea8fba6
commit df4ea8fba6
parent b1aed699ea
5 changed files with 798 additions and 603 deletions
--- a/litellm/llms/sagemaker.py
+++ b/litellm/llms/sagemaker.py
@ -7,16 +7,38 @@ import traceback
 import types
 from copy import deepcopy
 from enum import Enum
-from typing import Any, Callable, Optional
+from functools import partial
 from typing import Any, AsyncIterator, Callable, Iterator, List, Optional, Union
 import httpx  # type: ignore
 import requests  # type: ignore
 import litellm
-from litellm.utils import EmbeddingResponse, ModelResponse, Usage, get_secret
+from litellm._logging import verbose_logger
 from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,
    _get_async_httpx_client,
    _get_httpx_client,
 )
 from litellm.types.llms.openai import (
    ChatCompletionToolCallChunk,
    ChatCompletionUsageBlock,
 )
 from litellm.types.utils import GenericStreamingChunk as GChunk
 from litellm.utils import (
    CustomStreamWrapper,
    EmbeddingResponse,
    ModelResponse,
    Usage,
    get_secret,
 )
 from .base_aws_llm import BaseAWSLLM
 from .prompt_templates.factory import custom_prompt, prompt_factory
 _response_stream_shape_cache = None
 class SagemakerError(Exception):
    def __init__(self, status_code, message):
@ -31,73 +53,6 @@ class SagemakerError(Exception):
        )  # Call the base class constructor with the parameters it needs
 class TokenIterator:
    """Incrementally parse newline-delimited JSON events from a response stream.

    Each item pulled from ``stream`` is expected to be a mapping whose
    ``["PayloadPart"]["Bytes"]`` value holds raw bytes. The bytes are appended
    to an internal buffer; whenever a complete line is buffered it is decoded
    as UTF-8, an optional leading ``data:`` marker is removed and the
    remainder is parsed as JSON.

    Every parsed event is returned as ``{"text": <token text>, "is_finished":
    <bool>}``, where ``is_finished`` becomes True once the event carries a
    non-null ``generated_text``. If the stream is exhausted before such an
    event was seen, the literal string ``"data: [DONE]"`` is returned once,
    and only the following call raises ``StopIteration`` /
    ``StopAsyncIteration``.

    The same object supports both the sync and the async iterator protocol;
    ``acompletion`` selects which kind of ``stream`` was supplied.
    """

    _DATA_PREFIX = "data:"

    def __init__(self, stream, acompletion: bool = False):
        # A sync stream is wrapped with iter(); an async stream is kept as-is
        # and consumed through its __anext__ method.
        self.byte_iterator = stream if acompletion else iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0
        self.end_of_data = False

    def _pop_buffered_event(self):
        """Return the next parsed event, or None if no full line is buffered."""
        self.buffer.seek(self.read_pos)
        line = self.buffer.readline()
        if not line.endswith(b"\n"):
            return None

        # len(line) already counts the trailing newline; one extra byte is
        # skipped on purpose (kept from the original implementation).
        # NOTE(review): presumably this is the blank separator line that
        # follows each event - confirm against the endpoint's wire format.
        self.read_pos += len(line) + 1

        payload = line[:-1].decode("utf-8")
        # Remove the marker as a prefix. The previous lstrip("data:") /
        # rstrip("/n") calls stripped *character sets*, which only worked
        # because JSON objects start with "{" and end with "}".
        if payload.startswith(self._DATA_PREFIX):
            payload = payload[len(self._DATA_PREFIX):]
        line_data = json.loads(payload)

        response_obj = {"text": "", "is_finished": False}
        if line_data.get("generated_text", None) is not None:
            self.end_of_data = True
            response_obj["is_finished"] = True
        response_obj["text"] = line_data["token"]["text"]
        return response_obj

    def _append_chunk(self, chunk):
        """Append the raw bytes of one stream chunk to the end of the buffer."""
        self.buffer.seek(0, io.SEEK_END)
        self.buffer.write(chunk["PayloadPart"]["Bytes"])

    def _finish(self):
        """Return the one-time end marker; None means the marker was already due."""
        if self.end_of_data:
            return None
        self.end_of_data = True
        return "data: [DONE]"

    def __iter__(self):
        return self

    def __next__(self):
        try:
            while True:
                event = self._pop_buffered_event()
                if event is not None:
                    return event
                self._append_chunk(next(self.byte_iterator))
        except StopIteration:
            marker = self._finish()
            if marker is None:
                raise  # stream already finished: propagate StopIteration
            return marker

    def __aiter__(self):
        return self

    async def __anext__(self):
        try:
            while True:
                event = self._pop_buffered_event()
                if event is not None:
                    return event
                self._append_chunk(await self.byte_iterator.__anext__())
        except StopAsyncIteration:
            marker = self._finish()
            if marker is None:
                raise  # stream already finished: propagate StopAsyncIteration
            return marker
 class SagemakerConfig:
    """
    Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
@ -145,10 +100,89 @@ os.environ['AWS_ACCESS_KEY_ID'] = ""
 os.environ['AWS_SECRET_ACCESS_KEY'] = ""
 """
 # set os.environ['AWS_REGION_NAME'] = <your-region_name>
 class SagemakerLLM(BaseAWSLLM):
    def _prepare_request(
        self,
        model: str,
        data: dict,
        optional_params: dict,
        extra_headers: Optional[dict] = None,
    ):
        """Build a SigV4-signed POST request for a SageMaker runtime endpoint.

        Args:
            model: Endpoint name; interpolated into the invocation URL.
            data: JSON-serialisable request body.
            optional_params: Call parameters. All ``aws_*`` auth/config keys
                are popped from this dict (it is mutated in place) so they are
                not forwarded to the model. ``optional_params["stream"] is
                True`` selects the ``invocations-response-stream`` URL.
            extra_headers: Optional headers merged over the default
                ``Content-Type: application/json`` header before signing.

        Returns:
            The prepared request produced by ``AWSRequest.prepare()`` after
            ``SigV4Auth.add_auth`` has been applied.

        Raises:
            ImportError: If boto3 / botocore cannot be imported.
        """
        try:
            import boto3  # noqa: F401  (imported only to verify availability)
            from botocore.auth import SigV4Auth
            from botocore.awsrequest import AWSRequest
            from botocore.credentials import Credentials
        except ImportError as e:
            raise ImportError(
                "Missing boto3 to call sagemaker. Run 'pip install boto3'."
            ) from e

        ## CREDENTIALS ##
        # pop aws_secret_access_key, aws_access_key_id, aws_session_token, aws_region_name from kwargs, since completion calls fail with them
        aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
        aws_access_key_id = optional_params.pop("aws_access_key_id", None)
        aws_session_token = optional_params.pop("aws_session_token", None)
        aws_region_name = optional_params.pop("aws_region_name", None)
        aws_role_name = optional_params.pop("aws_role_name", None)
        aws_session_name = optional_params.pop("aws_session_name", None)
        aws_profile_name = optional_params.pop("aws_profile_name", None)
        # Not used for SageMaker, but still popped so it is removed from
        # optional_params like the other aws_* keys.
        optional_params.pop("aws_bedrock_runtime_endpoint", None)
        aws_web_identity_token = optional_params.pop("aws_web_identity_token", None)
        aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None)

        ### SET REGION NAME ###
        if aws_region_name is None:
            # Fall back to the environment / secret manager. AWS_REGION is
            # checked last and therefore wins over AWS_REGION_NAME when both
            # are set; "us-west-2" is the final default.
            for secret_name in ("AWS_REGION_NAME", "AWS_REGION"):
                candidate = get_secret(secret_name, None)
                if candidate is not None and isinstance(candidate, str):
                    aws_region_name = candidate
            if aws_region_name is None:
                aws_region_name = "us-west-2"

        credentials: Credentials = self.get_credentials(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_session_token=aws_session_token,
            aws_region_name=aws_region_name,
            aws_session_name=aws_session_name,
            aws_profile_name=aws_profile_name,
            aws_role_name=aws_role_name,
            aws_web_identity_token=aws_web_identity_token,
            aws_sts_endpoint=aws_sts_endpoint,
        )
        sigv4 = SigV4Auth(credentials, "sagemaker", aws_region_name)

        if optional_params.get("stream") is True:
            api_base = f"https://runtime.sagemaker.{aws_region_name}.amazonaws.com/endpoints/{model}/invocations-response-stream"
        else:
            api_base = f"https://runtime.sagemaker.{aws_region_name}.amazonaws.com/endpoints/{model}/invocations"

        encoded_data = json.dumps(data).encode("utf-8")
        headers = {"Content-Type": "application/json"}
        if extra_headers is not None:
            headers.update(extra_headers)

        request = AWSRequest(
            method="POST", url=api_base, data=encoded_data, headers=headers
        )
        # Signing covers the URL, body and the headers present at this point.
        sigv4.add_auth(request)
        return request.prepare()
    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
@ -161,39 +195,7 @@ def completion(
        litellm_params=None,
        logger_fn=None,
        acompletion: bool = False,
-):
+    ):
    import boto3
    # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
    aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
    aws_access_key_id = optional_params.pop("aws_access_key_id", None)
    aws_region_name = optional_params.pop("aws_region_name", None)
    model_id = optional_params.pop("model_id", None)
    if aws_access_key_id != None:
        # uses auth params passed to completion
        # aws_access_key_id is not None, assume user is trying to auth using litellm.completion
        client = boto3.client(
            service_name="sagemaker-runtime",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region_name,
        )
    else:
        # aws_access_key_id is None, assume user is trying to auth using env variables
        # boto3 automaticaly reads env variables
        # we need to read region name from env
        # I assume majority of users use .env for auth
        region_name = (
            get_secret("AWS_REGION_NAME")
            or aws_region_name  # get region from config file if specified
            or "us-west-2"  # default to us-west-2 if region not specified
        )
        client = boto3.client(
            service_name="sagemaker-runtime",
            region_name=region_name,
        )
        # pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
        inference_params = deepcopy(optional_params)
@ -206,13 +208,14 @@ def completion(
            ):  # completion(top_k=3) > sagemaker_config(top_k=3) <- allows for dynamic variables to be passed in
                inference_params[k] = v
    model = model
        if model in custom_prompt_dict:
            # check if the model has a registered custom prompt
            model_prompt_details = custom_prompt_dict[model]
            prompt = custom_prompt(
                role_dict=model_prompt_details.get("roles", None),
-            initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
+                initial_prompt_value=model_prompt_details.get(
                    "initial_prompt_value", ""
                ),
                final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
                messages=messages,
            )
@ -221,7 +224,9 @@ def completion(
            model_prompt_details = custom_prompt_dict[hf_model_name]
            prompt = custom_prompt(
                role_dict=model_prompt_details.get("roles", None),
-            initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
+                initial_prompt_value=model_prompt_details.get(
                    "initial_prompt_value", ""
                ),
                final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
                messages=messages,
            )
@ -237,12 +242,25 @@ def completion(
            )  # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
            prompt = prompt_factory(model=hf_model_name, messages=messages)
        stream = inference_params.pop("stream", None)
-    if stream == True:
+        model_id = optional_params.get("model_id", None)
-        data = json.dumps(
+
-            {"inputs": prompt, "parameters": inference_params, "stream": True}
+        if stream is True:
-        ).encode("utf-8")
+            data = {"inputs": prompt, "parameters": inference_params, "stream": True}
-        if acompletion == True:
+            prepared_request = self._prepare_request(
-            response = async_streaming(
+                model=model,
                data=data,
                optional_params=optional_params,
            )
            if model_id is not None:
                # Add model_id as InferenceComponentName header
                # boto3 doc: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html
                prepared_request.headers.update(
                    {"X-Amzn-SageMaker-Inference-Componen": model_id}
                )
            if acompletion is True:
                response = self.async_streaming(
                    prepared_request=prepared_request,
                    optional_params=optional_params,
                    encoding=encoding,
                    model_response=model_response,
@ -250,99 +268,104 @@ def completion(
                    logging_obj=logging_obj,
                    data=data,
                    model_id=model_id,
                aws_secret_access_key=aws_secret_access_key,
                aws_access_key_id=aws_access_key_id,
                aws_region_name=aws_region_name,
                )
                return response
        if model_id is not None:
            response = client.invoke_endpoint_with_response_stream(
                EndpointName=model,
                InferenceComponentName=model_id,
                ContentType="application/json",
                Body=data,
                CustomAttributes="accept_eula=true",
            )
            else:
-            response = client.invoke_endpoint_with_response_stream(
+                if stream is not None and stream == True:
-                EndpointName=model,
+                    sync_handler = _get_httpx_client()
-                ContentType="application/json",
+                    sync_response = sync_handler.post(
-                Body=data,
+                        url=prepared_request.url,
-                CustomAttributes="accept_eula=true",
+                        headers=prepared_request.headers,  # type: ignore
                        json=data,
                        stream=stream,
                    )
-        return response["Body"]
+
-    elif acompletion == True:
+                    if sync_response.status_code != 200:
                        raise SagemakerError(
                            status_code=sync_response.status_code,
                            message=sync_response.read(),
                        )
                    decoder = AWSEventStreamDecoder(model="")
                    completion_stream = decoder.iter_bytes(
                        sync_response.iter_bytes(chunk_size=1024)
                    )
                    streaming_response = CustomStreamWrapper(
                        completion_stream=completion_stream,
                        model=model,
                        custom_llm_provider="sagemaker",
                        logging_obj=logging_obj,
                    )
            ## LOGGING
            logging_obj.post_call(
                input=messages,
                api_key="",
                original_response=streaming_response,
                additional_args={"complete_input_dict": data},
            )
            return streaming_response
        # Non-Streaming Requests
        _data = {"inputs": prompt, "parameters": inference_params}
-        return async_completion(
+        prepared_request = self._prepare_request(
            model=model,
            data=_data,
            optional_params=optional_params,
-            encoding=encoding,
+        )
        # Async completion
        if acompletion == True:
            return self.async_completion(
                prepared_request=prepared_request,
                model_response=model_response,
                encoding=encoding,
                model=model,
                logging_obj=logging_obj,
                data=_data,
                model_id=model_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_access_key_id=aws_access_key_id,
            aws_region_name=aws_region_name,
            )
-    data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
+        ## Non-Streaming completion CALL
        "utf-8"
    )
    ## COMPLETION CALL
        try:
            if model_id is not None:
-            ## LOGGING
+                # Add model_id as InferenceComponentName header
-            request_str = f"""
+                # boto3 doc: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html
-            response = client.invoke_endpoint(
+                prepared_request.headers.update(
-                EndpointName={model},
+                    {"X-Amzn-SageMaker-Inference-Componen": model_id}
                InferenceComponentName={model_id},
                ContentType="application/json",
                Body={data}, # type: ignore
                CustomAttributes="accept_eula=true",
                )
-            """  # type: ignore
+
            ## LOGGING
            timeout = 300.0
            sync_handler = _get_httpx_client()
            ## LOGGING
            logging_obj.pre_call(
-                input=prompt,
+                input=[],
                api_key="",
                additional_args={
-                    "complete_input_dict": data,
+                    "complete_input_dict": _data,
-                    "request_str": request_str,
+                    "api_base": prepared_request.url,
-                    "hf_model_name": hf_model_name,
+                    "headers": prepared_request.headers,
                },
            )
-            response = client.invoke_endpoint(
+
-                EndpointName=model,
+            # make sync httpx post request here
-                InferenceComponentName=model_id,
+            try:
-                ContentType="application/json",
+                sync_response = sync_handler.post(
-                Body=data,
+                    url=prepared_request.url,
-                CustomAttributes="accept_eula=true",
+                    headers=prepared_request.headers,
                    json=_data,
                    timeout=timeout,
                )
-        else:
+            except Exception as e:
                ## LOGGING
-            request_str = f"""
+                logging_obj.post_call(
-            response = client.invoke_endpoint(
+                    input=[],
                EndpointName={model},
                ContentType="application/json",
                Body={data}, # type: ignore
                CustomAttributes="accept_eula=true",
            )
            """  # type: ignore
            logging_obj.pre_call(
                input=prompt,
                    api_key="",
-                additional_args={
+                    original_response=str(e),
-                    "complete_input_dict": data,
+                    additional_args={"complete_input_dict": _data},
                    "request_str": request_str,
                    "hf_model_name": hf_model_name,
                },
            )
            response = client.invoke_endpoint(
                EndpointName=model,
                ContentType="application/json",
                Body=data,
                CustomAttributes="accept_eula=true",
                )
                raise e
        except Exception as e:
            status_code = (
                getattr(e, "response", {})
@ -356,17 +379,16 @@ def completion(
                error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
            raise SagemakerError(status_code=status_code, message=error_message)
-    response = response["Body"].read().decode("utf8")
+        completion_response = sync_response.json()
        ## LOGGING
        logging_obj.post_call(
            input=prompt,
            api_key="",
-        original_response=response,
+            original_response=completion_response,
-        additional_args={"complete_input_dict": data},
+            additional_args={"complete_input_dict": _data},
        )
        print_verbose(f"raw model_response: {response}")
        ## RESPONSE OBJECT
    completion_response = json.loads(response)
        try:
            if isinstance(completion_response, list):
                completion_response_choices = completion_response[0]
@ -405,8 +427,57 @@ def completion(
        setattr(model_response, "usage", usage)
        return model_response
    async def make_async_call(
        self,
        api_base: str,
        headers: dict,
        data: str,
        logging_obj,
        client=None,
    ):
        """POST a streaming request and return an async iterator of decoded chunks.

        Args:
            api_base: URL to POST to.
            headers: Request headers.
            data: Request payload; forwarded as the ``json=`` argument of
                ``client.post``. NOTE(review): annotated ``str`` but sent via
                ``json=`` - callers appear to pass a dict; confirm.
            logging_obj: Accepted for interface compatibility; not used here.
                The ``post_call`` that used to follow the ``return`` statement
                was unreachable and has been removed, so runtime logging
                behaviour is unchanged.
            client: Optional async HTTP client; a new one is obtained from
                ``_get_async_httpx_client()`` when omitted.

        Returns:
            The result of ``AWSEventStreamDecoder.aiter_bytes`` applied to the
            response byte stream (1024-byte chunks).

        Raises:
            SagemakerError: With the response status code for non-200
                responses and ``httpx.HTTPStatusError``; 408 for timeouts;
                500 for any other failure.
        """
        try:
            if client is None:
                client = (
                    _get_async_httpx_client()
                )  # Create a new client if none provided
            response = await client.post(
                api_base,
                headers=headers,
                json=data,
                stream=True,
            )

            if response.status_code != 200:
                # NOTE(review): `.text` on a streamed response that has not
                # been read may itself raise in httpx - confirm the handler
                # reads the body before this point.
                raise SagemakerError(
                    status_code=response.status_code, message=response.text
                )

            decoder = AWSEventStreamDecoder(model="")
            return decoder.aiter_bytes(response.aiter_bytes(chunk_size=1024))
        except SagemakerError:
            # Keep the original status code; without this clause the generic
            # handler below would re-wrap the error as a 500.
            raise
        except httpx.HTTPStatusError as err:
            error_code = err.response.status_code
            raise SagemakerError(status_code=error_code, message=err.response.text)
        except httpx.TimeoutException:
            raise SagemakerError(status_code=408, message="Timeout error occurred.")
        except Exception as e:
            raise SagemakerError(status_code=500, message=str(e))
    async def async_streaming(
        self,
        prepared_request,
        optional_params,
        encoding,
        model_response: ModelResponse,
@ -414,170 +485,83 @@ async def async_streaming(
        model_id: Optional[str],
        logging_obj: Any,
        data,
-    aws_secret_access_key: Optional[str],
+    ):
-    aws_access_key_id: Optional[str],
+        streaming_response = CustomStreamWrapper(
-    aws_region_name: Optional[str],
+            completion_stream=None,
-):
+            make_call=partial(
-    """
+                self.make_async_call,
-    Use aioboto3
+                api_base=prepared_request.url,
-    """
+                headers=prepared_request.headers,
-    import aioboto3
+                data=data,
-
+                logging_obj=logging_obj,
-    session = aioboto3.Session()
+            ),
-
+            model=model,
-    if aws_access_key_id != None:
+            custom_llm_provider="sagemaker",
-        # uses auth params passed to completion
+            logging_obj=logging_obj,
        # aws_access_key_id is not None, assume user is trying to auth using litellm.completion
        _client = session.client(
            service_name="sagemaker-runtime",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region_name,
        )
    else:
        # aws_access_key_id is None, assume user is trying to auth using env variables
        # boto3 automaticaly reads env variables
        # we need to read region name from env
        # I assume majority of users use .env for auth
        region_name = (
            get_secret("AWS_REGION_NAME")
            or aws_region_name  # get region from config file if specified
            or "us-west-2"  # default to us-west-2 if region not specified
        )
        _client = session.client(
            service_name="sagemaker-runtime",
            region_name=region_name,
        )
-    async with _client as client:
+        # LOGGING
-        try:
+        logging_obj.post_call(
-            if model_id is not None:
+            input=[],
-                response = await client.invoke_endpoint_with_response_stream(
+            api_key="",
-                    EndpointName=model,
+            original_response="first stream response received",
-                    InferenceComponentName=model_id,
+            additional_args={"complete_input_dict": data},
                    ContentType="application/json",
                    Body=data,
                    CustomAttributes="accept_eula=true",
        )
            else:
                response = await client.invoke_endpoint_with_response_stream(
                    EndpointName=model,
                    ContentType="application/json",
                    Body=data,
                    CustomAttributes="accept_eula=true",
                )
        except Exception as e:
            raise SagemakerError(status_code=500, message=f"{str(e)}")
        response = response["Body"]
        async for chunk in response:
            yield chunk
        return streaming_response
-async def async_completion(
+    async def async_completion(
-    optional_params,
+        self,
        prepared_request,
        encoding,
        model_response: ModelResponse,
        model: str,
        logging_obj: Any,
        data: dict,
        model_id: Optional[str],
-    aws_secret_access_key: Optional[str],
+    ):
-    aws_access_key_id: Optional[str],
+        timeout = 300.0
-    aws_region_name: Optional[str],
+        async_handler = _get_async_httpx_client()
-):
+        ## LOGGING
-    """
+        logging_obj.pre_call(
-    Use aioboto3
+            input=[],
-    """
+            api_key="",
-    import aioboto3
+            additional_args={
-
+                "complete_input_dict": data,
-    session = aioboto3.Session()
+                "api_base": prepared_request.url,
-
+                "headers": prepared_request.headers,
-    if aws_access_key_id != None:
+            },
        # uses auth params passed to completion
        # aws_access_key_id is not None, assume user is trying to auth using litellm.completion
        _client = session.client(
            service_name="sagemaker-runtime",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region_name,
        )
    else:
        # aws_access_key_id is None, assume user is trying to auth using env variables
        # boto3 automaticaly reads env variables
        # we need to read region name from env
        # I assume majority of users use .env for auth
        region_name = (
            get_secret("AWS_REGION_NAME")
            or aws_region_name  # get region from config file if specified
            or "us-west-2"  # default to us-west-2 if region not specified
        )
        _client = session.client(
            service_name="sagemaker-runtime",
            region_name=region_name,
        )
    async with _client as client:
        encoded_data = json.dumps(data).encode("utf-8")
        try:
            if model_id is not None:
-                ## LOGGING
+                # Add model_id as InferenceComponentName header
-                request_str = f"""
+                # boto3 doc: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html
-                response = client.invoke_endpoint(
+                prepared_request.headers.update(
-                    EndpointName={model},
+                    {"X-Amzn-SageMaker-Inference-Componen": model_id}
                    InferenceComponentName={model_id},
                    ContentType="application/json",
                    Body={data},
                    CustomAttributes="accept_eula=true",
                )
-                """  # type: ignore
+            # make async httpx post request here
-                logging_obj.pre_call(
+            try:
                response = await async_handler.post(
                    url=prepared_request.url,
                    headers=prepared_request.headers,
                    json=data,
                    timeout=timeout,
                )
            except Exception as e:
                ## LOGGING
                logging_obj.post_call(
                    input=data["inputs"],
                    api_key="",
-                    additional_args={
+                    original_response=str(e),
-                        "complete_input_dict": data,
+                    additional_args={"complete_input_dict": data},
                        "request_str": request_str,
                    },
                )
                response = await client.invoke_endpoint(
                    EndpointName=model,
                    InferenceComponentName=model_id,
                    ContentType="application/json",
                    Body=encoded_data,
                    CustomAttributes="accept_eula=true",
                )
            else:
                ## LOGGING
                request_str = f"""
                response = client.invoke_endpoint(
                    EndpointName={model},
                    ContentType="application/json",
                    Body={data},
                    CustomAttributes="accept_eula=true",
                )
                """  # type: ignore
                logging_obj.pre_call(
                    input=data["inputs"],
                    api_key="",
                    additional_args={
                        "complete_input_dict": data,
                        "request_str": request_str,
                    },
                )
                response = await client.invoke_endpoint(
                    EndpointName=model,
                    ContentType="application/json",
                    Body=encoded_data,
                    CustomAttributes="accept_eula=true",
                )
                raise e
        except Exception as e:
            error_message = f"{str(e)}"
            if "Inference Component Name header is required" in error_message:
                error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
            raise SagemakerError(status_code=500, message=error_message)
-        response = await response["Body"].read()
+        completion_response = response.json()
        response = response.decode("utf8")
        ## LOGGING
        logging_obj.post_call(
            input=data["inputs"],
@ -586,7 +570,6 @@ async def async_completion(
            additional_args={"complete_input_dict": data},
        )
        ## RESPONSE OBJECT
        completion_response = json.loads(response)
        try:
            if isinstance(completion_response, list):
                completion_response_choices = completion_response[0]
@ -625,8 +608,8 @@ async def async_completion(
        setattr(model_response, "usage", usage)
        return model_response
-
+    def embedding(
-def embedding(
+        self,
        model: str,
        input: list,
        model_response: EmbeddingResponse,
@ -637,7 +620,7 @@ def embedding(
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
-):
+    ):
        """
        Supports Huggingface Jumpstart embeddings like GPT-6B
        """
@ -732,12 +715,15 @@ def embedding(
        print_verbose(f"raw model_response: {response}")
        if "embedding" not in response:
-        raise SagemakerError(status_code=500, message="embedding not found in response")
+            raise SagemakerError(
                status_code=500, message="embedding not found in response"
            )
        embeddings = response["embedding"]
        if not isinstance(embeddings, list):
            raise SagemakerError(
-            status_code=422, message=f"Response not in expected format - {embeddings}"
+                status_code=422,
                message=f"Response not in expected format - {embeddings}",
            )
        output_data = []
@ -758,8 +744,160 @@ def embedding(
            model_response,
            "usage",
            Usage(
-            prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
+                prompt_tokens=input_tokens,
                completion_tokens=0,
                total_tokens=input_tokens,
            ),
        )
        return model_response
 def get_response_stream_shape():
    """Return the botocore shape for ``InvokeEndpointWithResponseStreamOutput``.

    The shape is built from the ``sagemaker-runtime`` service model the first
    time it is needed and stored in the module-level
    ``_response_stream_shape_cache``; later calls return the stored object.
    """
    global _response_stream_shape_cache
    if _response_stream_shape_cache is not None:
        return _response_stream_shape_cache

    # botocore is only imported when the shape actually has to be built.
    from botocore.loaders import Loader
    from botocore.model import ServiceModel

    service_definition = Loader().load_service_model(
        "sagemaker-runtime", "service-2"
    )
    service_model = ServiceModel(service_definition)
    _response_stream_shape_cache = service_model.shape_for(
        "InvokeEndpointWithResponseStreamOutput"
    )
    return _response_stream_shape_cache
 class AWSEventStreamDecoder:
    """Turn a SageMaker response-stream byte feed into ``GChunk`` objects.

    Incoming bytes are framed by botocore's ``EventStreamBuffer``. Each event
    payload is treated as (a fragment of) a JSON document that may carry a
    leading ``data:`` prefix and a trailing blank line. Fragments are
    concatenated until they parse as JSON, and every parsed document is mapped
    to a chunk by ``_chunk_parser``.
    """

    def __init__(self, model: str) -> None:
        """Store the model name and create the botocore event-stream parser."""
        from botocore.parsers import EventStreamJSONParser

        self.model = model
        self.parser = EventStreamJSONParser()
        self.content_blocks: List = []

    def _chunk_parser(self, chunk_data: dict) -> GChunk:
        """Map one decoded JSON payload to a ``GChunk``.

        Args:
            chunk_data: Parsed payload; must contain ``"token"`` (a mapping
                with a ``"text"`` entry) and ``"index"``.

        Returns:
            A chunk carrying the token text, or an empty, finished chunk with
            ``finish_reason="stop"`` when the text is ``<|endoftext|>``.

        Raises:
            KeyError: If ``"token"``, ``"index"`` or the token's ``"text"``
                is missing.
        """
        verbose_logger.debug("in sagemaker chunk parser, chunk_data %s", chunk_data)
        _token = chunk_data["token"]
        _index = chunk_data["index"]
        _text = _token["text"]
        is_end_of_text = _text == "<|endoftext|>"
        return GChunk(
            text="" if is_end_of_text else _text,
            index=_index,
            is_finished=is_end_of_text,
            finish_reason="stop" if is_end_of_text else "",
            # The sagemaker branch of CustomStreamWrapper indexes
            # chunk["usage"] when `include_usage` is requested, so the key
            # must always be present to avoid a KeyError there.
            usage=None,
        )

    @staticmethod
    def _strip_sse_framing(message: str, is_continuation: bool) -> str:
        """Remove the ``data:`` prefix and blank-line terminator from a payload.

        Only a *leading* ``data:`` is removed, and only when ``message`` starts
        a new JSON document. Removing every occurrence would corrupt token
        text that itself contains ``data:``.
        """
        if not is_continuation:
            message = message.lstrip()
            if message.startswith("data:"):
                message = message[len("data:") :]
        return message.replace("\n\n", "")

    def _try_parse_chunk(self, accumulated_json: str) -> Optional[GChunk]:
        """Return the chunk for ``accumulated_json``, or None if it is not valid JSON yet."""
        try:
            _data = json.loads(accumulated_json)
        except json.JSONDecodeError:
            return None
        return self._chunk_parser(chunk_data=_data)

    @staticmethod
    def _report_unparsed(leftover: str) -> None:
        """Log JSON text that was still incomplete when the stream ended."""
        verbose_logger.error("Warning: Unparseable JSON data remained: %s", leftover)

    def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GChunk]:
        """Given an iterator that yields raw bytes, yield a chunk for every JSON payload decoded from it."""
        from botocore.eventstream import EventStreamBuffer

        event_stream_buffer = EventStreamBuffer()
        accumulated_json = ""
        for chunk in iterator:
            event_stream_buffer.add_data(chunk)
            for event in event_stream_buffer:
                message = self._parse_message_from_event(event)
                if not message:
                    continue
                verbose_logger.debug("sagemaker  parsed chunk bytes %s", message)
                accumulated_json += self._strip_sse_framing(
                    message, is_continuation=bool(accumulated_json)
                )
                parsed = self._try_parse_chunk(accumulated_json)
                if parsed is not None:
                    # A full document was consumed; start a fresh buffer.
                    accumulated_json = ""
                    yield parsed

        # Every append above is followed by a parse attempt and the buffer is
        # cleared on success, so anything left here has already failed to parse.
        if accumulated_json:
            self._report_unparsed(accumulated_json)

    async def aiter_bytes(
        self, iterator: AsyncIterator[bytes]
    ) -> AsyncIterator[GChunk]:
        """Given an async iterator that yields raw bytes, yield a chunk for every JSON payload decoded from it."""
        from botocore.eventstream import EventStreamBuffer

        event_stream_buffer = EventStreamBuffer()
        accumulated_json = ""
        async for chunk in iterator:
            event_stream_buffer.add_data(chunk)
            for event in event_stream_buffer:
                message = self._parse_message_from_event(event)
                if not message:
                    continue
                verbose_logger.debug("sagemaker  parsed chunk bytes %s", message)
                accumulated_json += self._strip_sse_framing(
                    message, is_continuation=bool(accumulated_json)
                )
                parsed = self._try_parse_chunk(accumulated_json)
                if parsed is not None:
                    # A full document was consumed; start a fresh buffer.
                    accumulated_json = ""
                    yield parsed

        # Every append above is followed by a parse attempt and the buffer is
        # cleared on success, so anything left here has already failed to parse.
        if accumulated_json:
            self._report_unparsed(accumulated_json)

    def _parse_message_from_event(self, event) -> Optional[str]:
        """Extract the decoded text payload of one event-stream event.

        Returns:
            The decoded ``chunk.bytes`` of the parsed response when a
            ``chunk`` member is present, otherwise the decoded raw ``body``;
            None when the selected payload is empty.

        Raises:
            ValueError: If the event's ``status_code`` is not 200.
        """
        response_dict = event.to_response_dict()
        parsed_response = self.parser.parse(response_dict, get_response_stream_shape())
        if response_dict["status_code"] != 200:
            raise ValueError(f"Bad response code, expected 200: {response_dict}")
        if "chunk" in parsed_response:
            chunk = parsed_response.get("chunk")
            if not chunk:
                return None
            return chunk.get("bytes").decode()  # type: ignore[no-any-return]

        # No structured `chunk` member: fall back to the raw event body.
        chunk = response_dict.get("body")
        if not chunk:
            return None
        return chunk.decode()  # type: ignore[no-any-return]
--- a/litellm/main.py
+++ b/litellm/main.py
@ -95,7 +95,6 @@ from .llms import (
    palm,
    petals,
    replicate,
    sagemaker,
    together_ai,
    triton,
    vertex_ai,
@ -120,6 +119,7 @@ from .llms.prompt_templates.factory import (
    prompt_factory,
    stringify_json_tool_call_content,
 )
 from .llms.sagemaker import SagemakerLLM
 from .llms.text_completion_codestral import CodestralTextCompletion
 from .llms.triton import TritonChatCompletion
 from .llms.vertex_ai_partner import VertexAIPartnerModels
@ -166,6 +166,7 @@ bedrock_converse_chat_completion = BedrockConverseLLM()
 vertex_chat_completion = VertexLLM()
 vertex_partner_models_chat_completion = VertexAIPartnerModels()
 watsonxai = IBMWatsonXAI()
 sagemaker_llm = SagemakerLLM()
 ####### COMPLETION ENDPOINTS ################
@ -2216,7 +2217,7 @@ def completion(
            response = model_response
        elif custom_llm_provider == "sagemaker":
            # boto3 reads keys from .env
-            model_response = sagemaker.completion(
+            model_response = sagemaker_llm.completion(
                model=model,
                messages=messages,
                model_response=model_response,
@ -2230,26 +2231,13 @@ def completion(
                logging_obj=logging,
                acompletion=acompletion,
            )
-            if (
+            if optional_params.get("stream", False):
                "stream" in optional_params and optional_params["stream"] == True
            ):  ## [BETA]
                print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
                from .llms.sagemaker import TokenIterator
                tokenIterator = TokenIterator(model_response, acompletion=acompletion)
                response = CustomStreamWrapper(
                    completion_stream=tokenIterator,
                    model=model,
                    custom_llm_provider="sagemaker",
                    logging_obj=logging,
                )
                ## LOGGING
                logging.post_call(
                    input=messages,
                    api_key=None,
-                    original_response=response,
+                    original_response=model_response,
                )
                return response
            ## RESPONSE OBJECT
            response = model_response
@ -3529,7 +3517,7 @@ def embedding(
                model_response=EmbeddingResponse(),
            )
        elif custom_llm_provider == "sagemaker":
-            response = sagemaker.embedding(
+            response = sagemaker_llm.embedding(
                model=model,
                input=input,
                encoding=encoding,
--- a/litellm/tests/test_sagemaker.py
+++ b/litellm/tests/test_sagemaker.py
@ -28,6 +28,9 @@ litellm.cache = None
 litellm.success_callback = []
 user_message = "Write a short poem about the sky"
 messages = [{"content": user_message, "role": "user"}]
 import logging
 from litellm._logging import verbose_logger
 def logger_fn(user_model_dict):
@ -80,6 +83,55 @@ async def test_completion_sagemaker(sync_mode):
        pytest.fail(f"Error occurred: {e}")
@pytest.mark.asyncio()
@pytest.mark.parametrize("sync_mode", [True])
async def test_completion_sagemaker_stream(sync_mode):
    """Stream a completion from the SageMaker endpoint and join the deltas.

    With ``sync_mode`` true the request goes through ``litellm.completion``;
    otherwise through ``litellm.acompletion``. Any exception fails the test.
    """
    # Both code paths send the same request.
    request_kwargs = {
        "model": "sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
        "messages": [
            {"role": "user", "content": "hi - what is ur name"},
        ],
        "temperature": 0.2,
        "stream": True,
        "max_tokens": 80,
        "input_cost_per_second": 0.000420,
    }
    try:
        litellm.set_verbose = False
        print("testing sagemaker")
        verbose_logger.setLevel(logging.DEBUG)
        full_text = ""
        if sync_mode is True:
            response = litellm.completion(**request_kwargs)
            for chunk in response:
                print(chunk)
                full_text += chunk.choices[0].delta.content or ""
            print("SYNC RESPONSE full text", full_text)
        else:
            response = await litellm.acompletion(**request_kwargs)
            print("streaming response")
            async for chunk in response:
                print(chunk)
                full_text += chunk.choices[0].delta.content or ""
            print("ASYNC RESPONSE full text", full_text)
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
@pytest.mark.asyncio
 async def test_acompletion_sagemaker_non_stream():
    mock_response = AsyncMock()
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@ -80,7 +80,7 @@ class ModelInfo(TypedDict, total=False):
    supports_assistant_prefill: Optional[bool]
-class GenericStreamingChunk(TypedDict):
+class GenericStreamingChunk(TypedDict, total=False):
    text: Required[str]
    tool_use: Optional[ChatCompletionToolCallChunk]
    is_finished: Required[bool]
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -9848,11 +9848,28 @@ class CustomStreamWrapper:
                    completion_obj["tool_calls"] = [response_obj["tool_use"]]
            elif self.custom_llm_provider == "sagemaker":
-                print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}")
+                from litellm.types.llms.bedrock import GenericStreamingChunk
-                response_obj = self.handle_sagemaker_stream(chunk)
+
                if self.received_finish_reason is not None:
                    raise StopIteration
                response_obj: GenericStreamingChunk = chunk
                completion_obj["content"] = response_obj["text"]
                if response_obj["is_finished"]:
                    self.received_finish_reason = response_obj["finish_reason"]
                if (
                    self.stream_options
                    and self.stream_options.get("include_usage", False) is True
                    and response_obj["usage"] is not None
                ):
                    model_response.usage = litellm.Usage(
                        prompt_tokens=response_obj["usage"]["inputTokens"],
                        completion_tokens=response_obj["usage"]["outputTokens"],
                        total_tokens=response_obj["usage"]["totalTokens"],
                    )
                if "tool_use" in response_obj and response_obj["tool_use"] is not None:
                    completion_obj["tool_calls"] = [response_obj["tool_use"]]
            elif self.custom_llm_provider == "petals":
                if len(self.completion_stream) == 0:
                    if self.received_finish_reason is not None: