fix(bedrock_httpx.py): move bedrock ai21 calls to being async
This commit is contained in: parent 180bc46ca4, commit 0293f7766a
5 changed files with 88 additions and 71 deletions
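
What the change means in practice: Bedrock AI21 models join Cohere and Anthropic on the async httpx `BedrockLLM` path, so `litellm.acompletion` can await the provider call directly instead of going through the synchronous fallback. A minimal usage sketch (model ID and parameters are taken from the tests further down; AWS credentials are assumed to be configured in the environment):

import asyncio

import litellm


async def main():
    # After this commit, bedrock/ai21.* routes through BedrockLLM's async httpx client.
    response = await litellm.acompletion(
        model="bedrock/ai21.j2-mid",
        messages=[{"role": "user", "content": "Hey! how's it going?"}],
        temperature=0.2,
        max_tokens=200,
    )
    print(response.choices[0].message.content)


asyncio.run(main())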
@@ -421,7 +421,12 @@ class BedrockLLM(BaseLLM):
                     setattr(model_response, "usage", _usage)
                 else:
                     outputText = completion_response["completion"]
+
                     model_response["finish_reason"] = completion_response["stop_reason"]
+            elif provider == "ai21":
+                outputText = (
+                    completion_response.get("completions")[0].get("data").get("text")
+                )
         except Exception as e:
             raise BedrockError(
                 message="Error processing={}, Received error={}".format(
@@ -430,6 +435,49 @@ class BedrockLLM(BaseLLM):
                 status_code=422,
             )

+        try:
+            if (
+                len(outputText) > 0
+                and hasattr(model_response.choices[0], "message")
+                and getattr(model_response.choices[0].message, "tool_calls", None)
+                is None
+            ):
+                model_response["choices"][0]["message"]["content"] = outputText
+            elif (
+                hasattr(model_response.choices[0], "message")
+                and getattr(model_response.choices[0].message, "tool_calls", None)
+                is not None
+            ):
+                pass
+            else:
+                raise Exception()
+        except:
+            raise BedrockError(
+                message=json.dumps(outputText), status_code=response.status_code
+            )
+
+        if stream and provider == "ai21":
+            streaming_model_response = ModelResponse(stream=True)
+            streaming_model_response.choices[0].finish_reason = model_response.choices[  # type: ignore
+                0
+            ].finish_reason
+            # streaming_model_response.choices = [litellm.utils.StreamingChoices()]
+            streaming_choice = litellm.utils.StreamingChoices()
+            streaming_choice.index = model_response.choices[0].index
+            delta_obj = litellm.utils.Delta(
+                content=getattr(model_response.choices[0].message, "content", None),
+                role=model_response.choices[0].message.role,
+            )
+            streaming_choice.delta = delta_obj
+            streaming_model_response.choices = [streaming_choice]
+            mri = ModelResponseIterator(model_response=streaming_model_response)
+            return CustomStreamWrapper(
+                completion_stream=mri,
+                model=model,
+                custom_llm_provider="cached_response",
+                logging_obj=logging_obj,
+            )
+
         ## CALCULATING USAGE - bedrock returns usage in the headers
         bedrock_input_tokens = response.headers.get(
             "x-amzn-bedrock-input-token-count", None
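The block added above is how `stream=True` keeps working for AI21: the provider is excluded from the `invoke-with-response-stream` endpoint (see the endpoint change below), so the fully materialized response is wrapped in a `ModelResponseIterator` and returned through `CustomStreamWrapper` with `custom_llm_provider="cached_response"`. A consumer-side sketch, assuming the same model ID used in the tests (prompt and token limits are illustrative):

import litellm

# stream=True still yields an iterable of chunks for AI21 on Bedrock;
# under the hood it is one complete response replayed as a pseudo-stream.
response = litellm.completion(
    model="bedrock/ai21.j2-mid",
    messages=[{"role": "user", "content": "hi"}],
    max_tokens=10,
    stream=True,
)
for chunk in response:
    print(chunk)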
@@ -489,6 +537,7 @@ class BedrockLLM(BaseLLM):

         ## SETUP ##
         stream = optional_params.pop("stream", None)
+        provider = model.split(".")[0]

         ## CREDENTIALS ##
         # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
@@ -544,14 +593,13 @@ class BedrockLLM(BaseLLM):
         else:
             endpoint_url = f"https://bedrock-runtime.{aws_region_name}.amazonaws.com"

-        if stream is not None and stream == True:
+        if (stream is not None and stream == True) and provider != "ai21":
             endpoint_url = f"{endpoint_url}/model/{model}/invoke-with-response-stream"
         else:
             endpoint_url = f"{endpoint_url}/model/{model}/invoke"

         sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name)

-        provider = model.split(".")[0]
         prompt, chat_history = self.convert_messages_to_prompt(
             model, messages, provider, custom_prompt_dict
         )
@@ -633,6 +681,16 @@ class BedrockLLM(BaseLLM):
                 ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                     inference_params[k] = v
             data = json.dumps({"prompt": prompt, **inference_params})
+        elif provider == "ai21":
+            ## LOAD CONFIG
+            config = litellm.AmazonAI21Config.get_config()
+            for k, v in config.items():
+                if (
+                    k not in inference_params
+                ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
+                    inference_params[k] = v
+
+            data = json.dumps({"prompt": prompt, **inference_params})
         else:
             raise Exception("UNSUPPORTED PROVIDER")

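For the AI21 branch added above, the request body is the same prompt-plus-params JSON shape used by the other Bedrock text-completion providers. A small sketch of the payload assembly (the prompt string and parameter names here are illustrative stand-ins, not output copied from litellm):

import json

# In the diff, `prompt` comes from convert_messages_to_prompt() and
# `inference_params` is optional_params merged with litellm.AmazonAI21Config.get_config().
prompt = "Human: Hey! how's it going?\n\nAssistant:"  # illustrative
inference_params = {"maxTokens": 200, "temperature": 0.2}  # illustrative keys

data = json.dumps({"prompt": prompt, **inference_params})
print(data)  # {"prompt": "...", "maxTokens": 200, "temperature": 0.2}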
@@ -662,7 +720,7 @@ class BedrockLLM(BaseLLM):
         if acompletion:
             if isinstance(client, HTTPHandler):
                 client = None
-            if stream:
+            if stream == True and provider != "ai21":
                 return self.async_streaming(
                     model=model,
                     messages=messages,
@@ -691,7 +749,7 @@ class BedrockLLM(BaseLLM):
                 encoding=encoding,
                 logging_obj=logging_obj,
                 optional_params=optional_params,
-                stream=False,
+                stream=stream,  # type: ignore
                 litellm_params=litellm_params,
                 logger_fn=logger_fn,
                 headers=prepped.headers,
@@ -708,7 +766,7 @@ class BedrockLLM(BaseLLM):
             self.client = HTTPHandler(**_params)  # type: ignore
         else:
             self.client = client
-        if stream is not None and stream == True:
+        if (stream is not None and stream == True) and provider != "ai21":
             response = self.client.post(
                 url=prepped.url,
                 headers=prepped.headers,  # type: ignore
@@ -787,7 +845,7 @@ class BedrockLLM(BaseLLM):
             model=model,
             response=response,
             model_response=model_response,
-            stream=stream,
+            stream=stream if isinstance(stream, bool) else False,
             logging_obj=logging_obj,
             api_key="",
             data=data,
@@ -328,7 +328,7 @@ async def acompletion(
         or custom_llm_provider == "predibase"
         or (
             custom_llm_provider == "bedrock"
-            and ("cohere" in model or "anthropic" in model)
+            and ("cohere" in model or "anthropic" in model or "ai21" in model)
         )
         or custom_llm_provider in litellm.openai_compatible_providers
     ):  # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
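The widened condition above is what lets `acompletion` treat Bedrock AI21 like the other natively-async providers. A simplified sketch of the dispatch pattern this condition gates (not the exact litellm code; `completion_fn` and `provider_supports_async` are hypothetical names for illustration):

import asyncio
from functools import partial


async def dispatch(provider_supports_async: bool, completion_fn, **kwargs):
    if provider_supports_async:
        # e.g. bedrock + cohere/anthropic/ai21 after this commit:
        # the underlying call already returns an awaitable.
        return await completion_fn(**kwargs)
    # Otherwise run the blocking completion in a worker thread.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partial(completion_fn, **kwargs))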
@@ -1982,7 +1982,7 @@ def completion(
             # boto3 reads keys from .env
             custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict

-            if "cohere" in model or "anthropic" in model:
+            if "cohere" in model or "anthropic" in model or "ai21" in model:
                 response = bedrock_chat_completion.completion(
                     model=model,
                     messages=messages,
@@ -2665,7 +2665,12 @@ def response_format_tests(response: litellm.ModelResponse):
 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.parametrize(
     "model",
-    ["bedrock/cohere.command-r-plus-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"],
+    [
+        "bedrock/cohere.command-r-plus-v1:0",
+        "anthropic.claude-3-sonnet-20240229-v1:0",
+        "anthropic.claude-instant-v1",
+        "bedrock/ai21.j2-mid",
+    ],
 )
 @pytest.mark.asyncio
 async def test_completion_bedrock_httpx_models(sync_mode, model):
@@ -2675,6 +2680,8 @@ async def test_completion_bedrock_httpx_models(sync_mode, model):
             response = completion(
                 model=model,
                 messages=[{"role": "user", "content": "Hey! how's it going?"}],
+                temperature=0.2,
+                max_tokens=200,
             )

             assert isinstance(response, litellm.ModelResponse)
@@ -2684,6 +2691,8 @@ async def test_completion_bedrock_httpx_models(sync_mode, model):
             response = await litellm.acompletion(
                 model=model,
                 messages=[{"role": "user", "content": "Hey! how's it going?"}],
+                temperature=0.2,
+                max_tokens=200,
             )

             assert isinstance(response, litellm.ModelResponse)
@@ -2740,48 +2749,9 @@ def test_completion_bedrock_titan():
 # test_completion_bedrock_titan()


-def test_completion_bedrock_claude():
-    print("calling claude")
-    try:
-        response = completion(
-            model="anthropic.claude-instant-v1",
-            messages=messages,
-            max_tokens=10,
-            temperature=0.1,
-            logger_fn=logger_fn,
-        )
-        # Add any assertions here to check the response
-        print(response)
-    except RateLimitError:
-        pass
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-
-
 # test_completion_bedrock_claude()


-def test_completion_bedrock_cohere():
-    print("calling bedrock cohere")
-    litellm.set_verbose = True
-    try:
-        response = completion(
-            model="bedrock/cohere.command-text-v14",
-            messages=[{"role": "user", "content": "hi"}],
-            temperature=0.1,
-            max_tokens=10,
-            stream=True,
-        )
-        # Add any assertions here to check the response
-        print(response)
-        for chunk in response:
-            print(chunk)
-    except RateLimitError:
-        pass
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-
-
 # test_completion_bedrock_cohere()


@@ -2804,23 +2774,6 @@ def test_completion_bedrock_cohere():
 #         pytest.fail(f"Error occurred: {e}")
 # test_completion_bedrock_claude_stream()

-# def test_completion_bedrock_ai21():
-#     try:
-#         litellm.set_verbose = False
-#         response = completion(
-#             model="bedrock/ai21.j2-mid",
-#             messages=messages,
-#             temperature=0.2,
-#             top_p=0.2,
-#             max_tokens=20
-#         )
-#         # Add any assertions here to check the response
-#         print(response)
-#     except RateLimitError:
-#         pass
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
-

 ######## Test VLLM ########
 # def test_completion_vllm():
@@ -1044,8 +1044,10 @@ async def test_completion_replicate_llama3_streaming(sync_mode):
 @pytest.mark.parametrize(
     "model",
     [
-        # "bedrock/cohere.command-r-plus-v1:0",
-        "anthropic.claude-3-sonnet-20240229-v1:0"
+        "bedrock/cohere.command-r-plus-v1:0",
+        "anthropic.claude-3-sonnet-20240229-v1:0",
+        "anthropic.claude-instant-v1",
+        "bedrock/ai21.j2-mid",
     ],
 )
 @pytest.mark.asyncio
@@ -11510,7 +11510,11 @@ class CustomStreamWrapper:
                 or self.custom_llm_provider == "predibase"
                 or (
                     self.custom_llm_provider == "bedrock"
-                    and ("cohere" in self.model or "anthropic" in self.model)
+                    and (
+                        "cohere" in self.model
+                        or "anthropic" in self.model
+                        or "ai21" in self.model
+                    )
                 )
                 or self.custom_llm_provider in litellm.openai_compatible_endpoints
             ):