Litellm dev bedrock anthropic 3 7 v2 (#8843)

* feat(bedrock/converse/transformation.py): support claude-3-7-sonnet reasoning_content transformation

Closes https://github.com/BerriAI/litellm/issues/8777

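A minimal sketch of what the non-streaming side looks like from the caller's perspective (the model id and `thinking` parameter values are illustrative assumptions, not taken from this commit):

import litellm

# Illustrative usage sketch: model id and thinking budget are assumptions.
response = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "How many r's are in 'strawberry'?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)

# Bedrock's raw reasoningContent block is surfaced as reasoning_content,
# mirroring how the Anthropic API path exposes it.
print(response.choices[0].message.reasoning_content)
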
* fix(bedrock/): support returning `reasoning_content` on streaming for claude-3-7

Resolves https://github.com/BerriAI/litellm/issues/8777

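The streaming counterpart, sketched under the same assumptions:

import litellm

stream = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    # Reasoning deltas arrive on the same field the non-streaming response
    # uses, so callers can handle both paths identically.
    if getattr(delta, "reasoning_content", None):
        print(delta.reasoning_content, end="")
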
* feat(bedrock/): unify converse reasoning content blocks for consistency across anthropic and bedrock

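The unified block shape, as far as this diff shows it (only the type and thinking keys appear in the translation code below; any further keys would be an assumption):

# Shape produced by translate_thinking_blocks() in the diff below,
# matching the ChatCompletionThinkingBlock typed dict.
thinking_block = {
    "type": "thinking",
    "thinking": "First, count the letters one by one...",
}
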
* fix(anthropic/chat/transformation.py): handle deepseek-style 'reasoning_content' extraction within transformation.py

simpler logic

* feat(bedrock/): fix streaming to return blocks in consistent format

* fix: fix linting error

* test: fix test

* feat(factory.py): fix bedrock thinking block translation on tool calling

allows passing the thinking blocks back to bedrock for tool calling

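A sketch of the round trip this enables; the tool schema and values are illustrative, the point being that the assistant turn is replayed with its thinking blocks intact:

import litellm

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # illustrative tool, not from this commit
            "description": "Get current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

messages = [{"role": "user", "content": "What's the weather in San Francisco?"}]
response = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=messages,
    tools=tools,
    thinking={"type": "enabled", "budget_tokens": 1024},
)

# Replay the assistant turn (thinking blocks included) plus the tool result;
# factory.py now translates those blocks back into Bedrock's format.
assistant_message = response.choices[0].message
messages.append(assistant_message.model_dump())
messages.append(
    {
        "role": "tool",
        "tool_call_id": assistant_message.tool_calls[0].id,
        "content": '{"temp_f": 68, "conditions": "sunny"}',
    }
)
final = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=messages,
    tools=tools,
    thinking={"type": "enabled", "budget_tokens": 1024},
)
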
* fix(types/utils.py): don't exclude provider_specific_fields on model dump

ensures consistent responses

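Sketch of the practical effect (constructing the type directly for brevity; whether Message accepts the field through its constructor like this is an assumption):

from litellm.types.utils import Message

# Previously provider_specific_fields was excluded during serialization,
# so dict round trips silently dropped provider data such as Bedrock's
# raw reasoningContent blocks.
msg = Message(
    content="The answer is 3.",
    provider_specific_fields={"reasoningContent": {"text": "Counting..."}},
)
print(msg.model_dump()["provider_specific_fields"])
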
* fix: fix linting errors

* fix(convert_dict_to_response.py): pass reasoning_content on root

* fix: test

* fix(streaming_handler.py): add helper util for setting model id

* fix(streaming_handler.py): fix setting model id on model response stream chunk

* fix(streaming_handler.py): fix linting error

* fix(streaming_handler.py): fix linting error

* fix(types/utils.py): add provider_specific_fields to model stream response

* fix(streaming_handler.py): copy provider specific fields and add them to the root of the streaming response

* fix(streaming_handler.py): fix check

* fix: fix test

* fix(types/utils.py): ensure messages content is always openai compatible

* fix(types/utils.py): fix delta object to always be openai compatible

only introduce new params if variable exists

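A sketch of that compatibility rule using the Delta type from this diff (the exact constructor behavior is inferred from the commit note above):

from litellm.types.utils import Delta

# Plain chunk: no reasoning fields attached, so the serialized delta
# stays strictly OpenAI-shaped.
plain = Delta(content="hi", role="assistant")
print("reasoning_content" in plain.model_dump())  # expected: False

# Reasoning chunk: the new fields are introduced only because values exist.
reasoned = Delta(
    content="",
    role="assistant",
    reasoning_content="Let me think...",
    thinking_blocks=[{"type": "thinking", "thinking": "Let me think..."}],
)
print(reasoned.model_dump()["reasoning_content"])  # "Let me think..."
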
* test: fix bedrock nova tests

* test: skip flaky test

* test: skip flaky test in ci/cd
Krish Dholakia 2025-02-26 16:05:33 -08:00 committed by GitHub
parent f3ef6c92a3
commit 05a973bf19
20 changed files with 447 additions and 149 deletions


@@ -26,7 +26,6 @@ import httpx  # type: ignore
 
 import litellm
 from litellm import verbose_logger
 from litellm._logging import print_verbose
-from litellm.caching.caching import InMemoryCache
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.litellm_logging import Logging
@@ -51,13 +50,19 @@ from litellm.llms.custom_httpx.http_handler import (
 )
 from litellm.types.llms.bedrock import *
 from litellm.types.llms.openai import (
+    ChatCompletionThinkingBlock,
     ChatCompletionToolCallChunk,
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionUsageBlock,
 )
-from litellm.types.utils import ChatCompletionMessageToolCall, Choices
+from litellm.types.utils import ChatCompletionMessageToolCall, Choices, Delta
 from litellm.types.utils import GenericStreamingChunk as GChunk
-from litellm.types.utils import ModelResponse, ModelResponseStream, Usage
+from litellm.types.utils import (
+    ModelResponse,
+    ModelResponseStream,
+    StreamingChoices,
+    Usage,
+)
 from litellm.utils import CustomStreamWrapper, get_secret
 
 from ..base_aws_llm import BaseAWSLLM
@@ -212,7 +217,6 @@ async def make_call(
                 api_key="",
                 data=data,
                 messages=messages,
-                print_verbose=print_verbose,
                 encoding=litellm.encoding,
             )  # type: ignore
             completion_stream: Any = MockResponseIterator(
@@ -298,7 +302,6 @@ def make_sync_call(
                 api_key="",
                 data=data,
                 messages=messages,
-                print_verbose=print_verbose,
                 encoding=litellm.encoding,
             )  # type: ignore
             completion_stream: Any = MockResponseIterator(
@@ -525,7 +528,7 @@ class BedrockLLM(BaseAWSLLM):
                     ].message.tool_calls:
                         _tool_call = {**tool_call.dict(), "index": 0}
                         _tool_calls.append(_tool_call)
-                    delta_obj = litellm.utils.Delta(
+                    delta_obj = Delta(
                         content=getattr(
                             model_response.choices[0].message, "content", None
                         ),
@@ -1258,14 +1261,37 @@
             return True
         return False
 
-    def converse_chunk_parser(self, chunk_data: dict) -> GChunk:
+    def extract_reasoning_content_str(
+        self, reasoning_content_block: BedrockConverseReasoningContentBlockDelta
+    ) -> Optional[str]:
+        if "text" in reasoning_content_block:
+            return reasoning_content_block["text"]
+        return None
+
+    def translate_thinking_blocks(
+        self, thinking_block: BedrockConverseReasoningContentBlockDelta
+    ) -> Optional[List[ChatCompletionThinkingBlock]]:
+        """
+        Translate the thinking blocks to a string
+        """
+        thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
+        _thinking_block = ChatCompletionThinkingBlock(type="thinking")
+        if "text" in thinking_block:
+            _thinking_block["thinking"] = thinking_block["text"]
+        thinking_blocks_list.append(_thinking_block)
+        return thinking_blocks_list
+
+    def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
         try:
             verbose_logger.debug("\n\nRaw Chunk: {}\n\n".format(chunk_data))
             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None
             is_finished = False
             finish_reason = ""
             usage: Optional[ChatCompletionUsageBlock] = None
+            provider_specific_fields: dict = {}
+            reasoning_content: Optional[str] = None
+            thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
 
             index = int(chunk_data.get("contentBlockIndex", 0))
             if "start" in chunk_data:
@@ -1305,6 +1331,16 @@
                         },
                         "index": index,
                     }
+                elif "reasoningContent" in delta_obj:
+                    provider_specific_fields = {
+                        "reasoningContent": delta_obj["reasoningContent"],
+                    }
+                    reasoning_content = self.extract_reasoning_content_str(
+                        delta_obj["reasoningContent"]
+                    )
+                    thinking_blocks = self.translate_thinking_blocks(
+                        delta_obj["reasoningContent"]
+                    )
             elif (
                 "contentBlockIndex" in chunk_data
             ):  # stop block, no 'start' or 'delta' object
@@ -1321,7 +1357,6 @@
                     }
             elif "stopReason" in chunk_data:
                 finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop"))
-                is_finished = True
             elif "usage" in chunk_data:
                 usage = ChatCompletionUsageBlock(
                     prompt_tokens=chunk_data.get("inputTokens", 0),
@@ -1329,18 +1364,33 @@
                     total_tokens=chunk_data.get("totalTokens", 0),
                 )
 
-            response = GChunk(
-                text=text,
-                tool_use=tool_use,
-                is_finished=is_finished,
-                finish_reason=finish_reason,
-                usage=usage,
-                index=index,
-            )
-
+            model_response_provider_specific_fields = {}
             if "trace" in chunk_data:
                 trace = chunk_data.get("trace")
-                response["provider_specific_fields"] = {"trace": trace}
+                model_response_provider_specific_fields["trace"] = trace
+
+            response = ModelResponseStream(
+                choices=[
+                    StreamingChoices(
+                        finish_reason=finish_reason,
+                        index=index,
+                        delta=Delta(
+                            content=text,
+                            role="assistant",
+                            tool_calls=[tool_use] if tool_use else None,
+                            provider_specific_fields=(
+                                provider_specific_fields
+                                if provider_specific_fields
+                                else None
+                            ),
+                            thinking_blocks=thinking_blocks,
+                            reasoning_content=reasoning_content,
+                        ),
+                    )
+                ],
+                usage=usage,
+                provider_specific_fields=model_response_provider_specific_fields,
+            )
             return response
         except Exception as e:
             raise Exception("Received streaming error - {}".format(str(e)))
@@ -1486,7 +1536,7 @@ class AmazonAnthropicClaudeStreamDecoder(AWSEventStreamDecoder):
             sync_stream=sync_stream,
         )
 
-    def _chunk_parser(self, chunk_data: dict) -> GChunk:
+    def _chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
         return self.anthropic_model_response_iterator.chunk_parser(chunk=chunk_data)
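
For orientation, a sketch of the translation the converse_chunk_parser hunk performs; the raw event shape follows Bedrock's ConverseStream API, and the surrounding plumbing is omitted:

# A raw Bedrock ConverseStream delta event carrying reasoning text:
chunk_data = {
    "contentBlockIndex": 0,
    "delta": {"reasoningContent": {"text": "First, consider the premise..."}},
}

# AWSEventStreamDecoder.converse_chunk_parser(chunk_data) now returns a
# ModelResponseStream whose delta carries:
#   delta.reasoning_content         -> "First, consider the premise..."
#   delta.thinking_blocks           -> [{"type": "thinking",
#                                        "thinking": "First, consider the premise..."}]
#   delta.provider_specific_fields  -> the untouched reasoningContent block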