Merge pull request #5343 from BerriAI/litellm_sagemaker_chat

feat(sagemaker.py): add sagemaker messages api support
2024-08-23 21:00:00 -07:00 · 2024-08-23 21:00:00 -07:00 · cd61ddc610
commit cd61ddc610
parent 30ec166556 3f116b25a9
6 changed files with 112 additions and 17 deletions
--- a/litellm/init.py
+++ b/litellm/init.py
@ -669,6 +669,7 @@ provider_list: List = [
    "azure_text",
    "azure_ai",
    "sagemaker",
    "sagemaker_chat",
    "bedrock",
    "vllm",
    "nlp_cloud",
--- a/litellm/llms/databricks.py
+++ b/litellm/llms/databricks.py
@ -235,23 +235,28 @@ class DatabricksChatCompletion(BaseLLM):
        api_base: Optional[str],
        endpoint_type: Literal["chat_completions", "embeddings"],
        custom_endpoint: Optional[bool],
        headers: Optional[dict],
    ) -> Tuple[str, dict]:
-        if api_key is None:
+        if api_key is None and headers is None:
            raise DatabricksError(
                status_code=400,
-                message="Missing Databricks API Key - A call is being made to Databricks but no key is set either in the environment variables (DATABRICKS_API_KEY) or via params",
+                message="Missing API Key - A call is being made to LLM Provider but no key is set either in the environment variables ({LLM_PROVIDER}_API_KEY) or via params",
            )
        if api_base is None:
            raise DatabricksError(
                status_code=400,
-                message="Missing Databricks API Base - A call is being made to Databricks but no api base is set either in the environment variables (DATABRICKS_API_BASE) or via params",
+                message="Missing API Base - A call is being made to LLM Provider but no api base is set either in the environment variables ({LLM_PROVIDER}_API_BASE) or via params",
            )
-        headers = {
+        if headers is None:
-            "Authorization": "Bearer {}".format(api_key),
+            headers = {
-            "Content-Type": "application/json",
+                "Authorization": "Bearer {}".format(api_key),
-        }
+                "Content-Type": "application/json",
            }
        else:
            if api_key is not None:
                headers.update({"Authorization": "Bearer {}".format(api_key)})
        if endpoint_type == "chat_completions" and custom_endpoint is not True:
            api_base = "{}/chat/completions".format(api_base)
@ -356,23 +361,27 @@ class DatabricksChatCompletion(BaseLLM):
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding,
-        api_key,
+        api_key: Optional[str],
        logging_obj,
        optional_params: dict,
        acompletion=None,
        litellm_params=None,
        logger_fn=None,
-        headers={},
+        headers: Optional[dict] = None,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
        custom_endpoint: Optional[bool] = None,
    ):
-        custom_endpoint: Optional[bool] = optional_params.pop("custom_endpoint", None)
+        custom_endpoint = custom_endpoint or optional_params.pop(
            "custom_endpoint", None
        )
        base_model: Optional[str] = optional_params.pop("base_model", None)
        api_base, headers = self._validate_environment(
            api_base=api_base,
            api_key=api_key,
            endpoint_type="chat_completions",
            custom_endpoint=custom_endpoint,
            headers=headers,
        )
        ## Load Config
        config = litellm.DatabricksConfig().get_config()
@ -382,7 +391,7 @@ class DatabricksChatCompletion(BaseLLM):
            ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                optional_params[k] = v
-        stream: bool = optional_params.pop("stream", None) or False
+        stream: bool = optional_params.get("stream", None) or False
        optional_params["stream"] = stream
        data = {
@ -565,12 +574,14 @@ class DatabricksChatCompletion(BaseLLM):
        model_response: Optional[litellm.utils.EmbeddingResponse] = None,
        client=None,
        aembedding=None,
        headers: Optional[dict] = None,
    ) -> EmbeddingResponse:
        api_base, headers = self._validate_environment(
            api_base=api_base,
            api_key=api_key,
            endpoint_type="embeddings",
            custom_endpoint=False,
            headers=headers,
        )
        model = model
        data = {"model": model, "input": input, **optional_params}
--- a/litellm/llms/sagemaker.py
+++ b/litellm/llms/sagemaker.py
@ -206,17 +206,60 @@ class SagemakerLLM(BaseAWSLLM):
        print_verbose: Callable,
        encoding,
        logging_obj,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
        custom_prompt_dict={},
        hf_model_name=None,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
        acompletion: bool = False,
        use_messages_api: Optional[bool] = None,
    ):
        # pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
        credentials, aws_region_name = self._load_credentials(optional_params)
        inference_params = deepcopy(optional_params)
        stream = inference_params.pop("stream", None)
        model_id = optional_params.get("model_id", None)
        if use_messages_api is True:
            from litellm.llms.databricks import DatabricksChatCompletion
            openai_like_chat_completions = DatabricksChatCompletion()
            inference_params["stream"] = True if stream is True else False
            _data = {
                "model": model,
                "messages": messages,
                **inference_params,
            }
            prepared_request = self._prepare_request(
                model=model,
                data=_data,
                optional_params=optional_params,
                credentials=credentials,
                aws_region_name=aws_region_name,
            )
            return openai_like_chat_completions.completion(
                model=model,
                messages=messages,
                api_base=prepared_request.url,
                api_key=None,
                custom_prompt_dict=custom_prompt_dict,
                model_response=model_response,
                print_verbose=print_verbose,
                logging_obj=logging_obj,
                optional_params=inference_params,
                acompletion=acompletion,
                litellm_params=litellm_params,
                logger_fn=logger_fn,
                timeout=timeout,
                encoding=encoding,
                headers=prepared_request.headers,
                custom_endpoint=True,
                custom_llm_provider="sagemaker_chat",
            )
        ## Load Config
        config = litellm.SagemakerConfig.get_config()
@ -259,8 +302,6 @@ class SagemakerLLM(BaseAWSLLM):
                hf_model_name or model
            )  # pass in hf model name for pulling its prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf"` applies the llama2 chat template to the prompt)
            prompt = prompt_factory(model=hf_model_name, messages=messages)
        stream = inference_params.pop("stream", None)
        model_id = optional_params.get("model_id", None)
        if stream is True:
            data = {"inputs": prompt, "parameters": inference_params, "stream": True}
@ -275,7 +316,7 @@ class SagemakerLLM(BaseAWSLLM):
                # Add model_id as InferenceComponentName header
                # boto3 doc: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html
                prepared_request.headers.update(
-                    {"X-Amzn-SageMaker-Inference-Componen": model_id}
+                    {"X-Amzn-SageMaker-Inference-Component": model_id}
                )
            if acompletion is True:
@ -338,7 +379,7 @@ class SagemakerLLM(BaseAWSLLM):
        )
        # Async completion
-        if acompletion == True:
+        if acompletion is True:
            return self.async_completion(
                prepared_request=prepared_request,
                model_response=model_response,
@ -354,7 +395,7 @@ class SagemakerLLM(BaseAWSLLM):
                # Add model_id as InferenceComponentName header
                # boto3 doc: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_runtime_InvokeEndpoint.html
                prepared_request.headers.update(
-                    {"X-Amzn-SageMaker-Inference-Componen": model_id}
+                    {"X-Amzn-SageMaker-Inference-Component": model_id}
                )
            ## LOGGING
--- a/litellm/main.py
+++ b/litellm/main.py
@ -383,6 +383,7 @@ async def acompletion(
            or custom_llm_provider == "vertex_ai_beta"
            or custom_llm_provider == "gemini"
            or custom_llm_provider == "sagemaker"
            or custom_llm_provider == "sagemaker_chat"
            or custom_llm_provider == "anthropic"
            or custom_llm_provider == "predibase"
            or custom_llm_provider == "bedrock"
@ -2248,7 +2249,10 @@ def completion(
            ## RESPONSE OBJECT
            response = model_response
-        elif custom_llm_provider == "sagemaker":
+        elif (
            custom_llm_provider == "sagemaker"
            or custom_llm_provider == "sagemaker_chat"
        ):
            # boto3 reads keys from .env
            model_response = sagemaker_llm.completion(
                model=model,
@ -2263,6 +2267,9 @@ def completion(
                encoding=encoding,
                logging_obj=logging,
                acompletion=acompletion,
                use_messages_api=(
                    True if custom_llm_provider == "sagemaker_chat" else False
                ),
            )
            if optional_params.get("stream", False):
                ## LOGGING
--- a/litellm/tests/test_sagemaker.py
+++ b/litellm/tests/test_sagemaker.py
@ -84,6 +84,40 @@ async def test_completion_sagemaker(sync_mode):
        pytest.fail(f"Error occurred: {e}")
# Test for the `sagemaker_chat/` model prefix (SageMaker messages API route).
# Parametrized over `sync_mode`: True calls `litellm.completion`, False awaits
# `litellm.acompletion`, with the same model, messages, temperature and
# max_tokens in both branches. Any exception raised by the call is turned into
# a test failure.
# NOTE(review): the model name looks like a specific deployed SageMaker
# endpoint, so this presumably needs live AWS credentials — confirm.
@pytest.mark.asyncio()
@pytest.mark.parametrize(
    "sync_mode",
    [True, False],
 )
 async def test_completion_sagemaker_messages_api(sync_mode):
    try:
        # Turn on verbose output and DEBUG-level logging before making the call.
        litellm.set_verbose = True
        verbose_logger.setLevel(logging.DEBUG)
        print("testing sagemaker")
        if sync_mode is True:
            # Synchronous path.
            resp = litellm.completion(
                model="sagemaker_chat/huggingface-pytorch-tgi-inference-2024-08-23-15-48-59-245",
                messages=[
                    {"role": "user", "content": "hi"},
                ],
                temperature=0.2,
                max_tokens=80,
            )
            print(resp)
        else:
            # Asynchronous path; same arguments as the synchronous call above.
            resp = await litellm.acompletion(
                model="sagemaker_chat/huggingface-pytorch-tgi-inference-2024-08-23-15-48-59-245",
                messages=[
                    {"role": "user", "content": "hi"},
                ],
                temperature=0.2,
                max_tokens=80,
            )
            print(resp)
    except Exception as e:
        # Report any error as an explicit test failure carrying the error text.
        pytest.fail(f"Error occurred: {e}")
@pytest.mark.asyncio()
@pytest.mark.parametrize("sync_mode", [False, True])
 async def test_completion_sagemaker_stream(sync_mode):
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -10614,6 +10614,7 @@ class CustomStreamWrapper:
                or self.custom_llm_provider == "vertex_ai"
                or self.custom_llm_provider == "vertex_ai_beta"
                or self.custom_llm_provider == "sagemaker"
                or self.custom_llm_provider == "sagemaker_chat"
                or self.custom_llm_provider == "gemini"
                or self.custom_llm_provider == "replicate"
                or self.custom_llm_provider == "cached_response"