Litellm dev bedrock anthropic 3 7 v2 (#8843)

* feat(bedrock/converse/transformation.py): support claude-3-7-sonnet reasoning_content transformation

Closes https://github.com/BerriAI/litellm/issues/8777

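A minimal sketch of what the non-streaming side looks like from the caller's perspective (the model id and `thinking` parameter values are illustrative assumptions, not taken from this commit):

import litellm

# Illustrative usage sketch: model id and thinking budget are assumptions.
response = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "How many r's are in 'strawberry'?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
)

# Bedrock's raw reasoningContent block is surfaced as reasoning_content,
# mirroring how the Anthropic API path exposes it.
print(response.choices[0].message.reasoning_content)
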
* fix(bedrock/): support returning `reasoning_content` on streaming for claude-3-7

Resolves https://github.com/BerriAI/litellm/issues/8777

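The streaming counterpart, sketched under the same assumptions:

import litellm

stream = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    thinking={"type": "enabled", "budget_tokens": 1024},
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    # Reasoning deltas arrive on the same field the non-streaming response
    # uses, so callers can handle both paths identically.
    if getattr(delta, "reasoning_content", None):
        print(delta.reasoning_content, end="")
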
* feat(bedrock/): unify converse reasoning content blocks for consistency across anthropic and bedrock

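The unified block shape, as far as this diff shows it (only the type and thinking keys appear in the translation code below; any further keys would be an assumption):

# Shape produced by translate_thinking_blocks() in the diff below,
# matching the ChatCompletionThinkingBlock typed dict.
thinking_block = {
    "type": "thinking",
    "thinking": "First, count the letters one by one...",
}
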
* fix(anthropic/chat/transformation.py): handle deepseek-style 'reasoning_content' extraction within transformation.py

simpler logic

* feat(bedrock/): fix streaming to return blocks in consistent format

* fix: fix linting error

* test: fix test

* feat(factory.py): fix bedrock thinking block translation on tool calling

allows passing the thinking blocks back to bedrock for tool calling

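A sketch of the round trip this enables; the tool schema and values are illustrative, the point being that the assistant turn is replayed with its thinking blocks intact:

import litellm

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # illustrative tool, not from this commit
            "description": "Get current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

messages = [{"role": "user", "content": "What's the weather in San Francisco?"}]
response = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=messages,
    tools=tools,
    thinking={"type": "enabled", "budget_tokens": 1024},
)

# Replay the assistant turn (thinking blocks included) plus the tool result;
# factory.py now translates those blocks back into Bedrock's format.
assistant_message = response.choices[0].message
messages.append(assistant_message.model_dump())
messages.append(
    {
        "role": "tool",
        "tool_call_id": assistant_message.tool_calls[0].id,
        "content": '{"temp_f": 68, "conditions": "sunny"}',
    }
)
final = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=messages,
    tools=tools,
    thinking={"type": "enabled", "budget_tokens": 1024},
)
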
* fix(types/utils.py): don't exclude provider_specific_fields on model dump

ensures consistent responses

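Sketch of the practical effect (constructing the type directly for brevity; whether Message accepts the field through its constructor like this is an assumption):

from litellm.types.utils import Message

# Previously provider_specific_fields was excluded during serialization,
# so dict round trips silently dropped provider data such as Bedrock's
# raw reasoningContent blocks.
msg = Message(
    content="The answer is 3.",
    provider_specific_fields={"reasoningContent": {"text": "Counting..."}},
)
print(msg.model_dump()["provider_specific_fields"])
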
* fix: fix linting errors

* fix(convert_dict_to_response.py): pass reasoning_content on root

* fix: test

* fix(streaming_handler.py): add helper util for setting model id

* fix(streaming_handler.py): fix setting model id on model response stream chunk

* fix(streaming_handler.py): fix linting error

* fix(streaming_handler.py): fix linting error

* fix(types/utils.py): add provider_specific_fields to model stream response

* fix(streaming_handler.py): copy provider specific fields and add them to the root of the streaming response

* fix(streaming_handler.py): fix check

* fix: fix test

* fix(types/utils.py): ensure messages content is always openai compatible

* fix(types/utils.py): fix delta object to always be openai compatible

only introduce new params if variable exists

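A sketch of that compatibility rule using the Delta type from this diff (the exact constructor behavior is inferred from the commit note above):

from litellm.types.utils import Delta

# Plain chunk: no reasoning fields attached, so the serialized delta
# stays strictly OpenAI-shaped.
plain = Delta(content="hi", role="assistant")
print("reasoning_content" in plain.model_dump())  # expected: False

# Reasoning chunk: the new fields are introduced only because values exist.
reasoned = Delta(
    content="",
    role="assistant",
    reasoning_content="Let me think...",
    thinking_blocks=[{"type": "thinking", "thinking": "Let me think..."}],
)
print(reasoned.model_dump()["reasoning_content"])  # "Let me think..."
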
* test: fix bedrock nova tests

* test: skip flaky test

* test: skip flaky test in ci/cd
Krish Dholakia 2025-02-26 16:05:33 -08:00 committed by GitHub
parent f3ef6c92a3
commit 05a973bf19
20 changed files with 447 additions and 149 deletions


@@ -26,7 +26,6 @@ import httpx  # type: ignore
 
 import litellm
 from litellm import verbose_logger
 from litellm._logging import print_verbose
-from litellm.caching.caching import InMemoryCache
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.litellm_logging import Logging
@@ -51,13 +50,19 @@ from litellm.llms.custom_httpx.http_handler import (
 )
 from litellm.types.llms.bedrock import *
 from litellm.types.llms.openai import (
+    ChatCompletionThinkingBlock,
     ChatCompletionToolCallChunk,
     ChatCompletionToolCallFunctionChunk,
     ChatCompletionUsageBlock,
 )
-from litellm.types.utils import ChatCompletionMessageToolCall, Choices
+from litellm.types.utils import ChatCompletionMessageToolCall, Choices, Delta
 from litellm.types.utils import GenericStreamingChunk as GChunk
-from litellm.types.utils import ModelResponse, ModelResponseStream, Usage
+from litellm.types.utils import (
+    ModelResponse,
+    ModelResponseStream,
+    StreamingChoices,
+    Usage,
+)
 from litellm.utils import CustomStreamWrapper, get_secret
 
 from ..base_aws_llm import BaseAWSLLM
@@ -212,7 +217,6 @@ async def make_call(
                 api_key="",
                 data=data,
                 messages=messages,
-                print_verbose=print_verbose,
                 encoding=litellm.encoding,
             )  # type: ignore
             completion_stream: Any = MockResponseIterator(
@@ -298,7 +302,6 @@ def make_sync_call(
                 api_key="",
                 data=data,
                 messages=messages,
-                print_verbose=print_verbose,
                 encoding=litellm.encoding,
             )  # type: ignore
             completion_stream: Any = MockResponseIterator(
@@ -525,7 +528,7 @@ class BedrockLLM(BaseAWSLLM):
                     ].message.tool_calls:
                         _tool_call = {**tool_call.dict(), "index": 0}
                         _tool_calls.append(_tool_call)
-                    delta_obj = litellm.utils.Delta(
+                    delta_obj = Delta(
                         content=getattr(
                             model_response.choices[0].message, "content", None
                         ),
@@ -1258,14 +1261,37 @@
             return True
         return False
 
-    def converse_chunk_parser(self, chunk_data: dict) -> GChunk:
+    def extract_reasoning_content_str(
+        self, reasoning_content_block: BedrockConverseReasoningContentBlockDelta
+    ) -> Optional[str]:
+        if "text" in reasoning_content_block:
+            return reasoning_content_block["text"]
+        return None
+
+    def translate_thinking_blocks(
+        self, thinking_block: BedrockConverseReasoningContentBlockDelta
+    ) -> Optional[List[ChatCompletionThinkingBlock]]:
+        """
+        Translate the thinking blocks to a string
+        """
+        thinking_blocks_list: List[ChatCompletionThinkingBlock] = []
+        _thinking_block = ChatCompletionThinkingBlock(type="thinking")
+        if "text" in thinking_block:
+            _thinking_block["thinking"] = thinking_block["text"]
+        thinking_blocks_list.append(_thinking_block)
+        return thinking_blocks_list
+
+    def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
         try:
             verbose_logger.debug("\n\nRaw Chunk: {}\n\n".format(chunk_data))
             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None
             is_finished = False
             finish_reason = ""
             usage: Optional[ChatCompletionUsageBlock] = None
+            provider_specific_fields: dict = {}
+            reasoning_content: Optional[str] = None
+            thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
 
             index = int(chunk_data.get("contentBlockIndex", 0))
             if "start" in chunk_data:
@@ -1305,6 +1331,16 @@
                         },
                         "index": index,
                     }
+                elif "reasoningContent" in delta_obj:
+                    provider_specific_fields = {
+                        "reasoningContent": delta_obj["reasoningContent"],
+                    }
+                    reasoning_content = self.extract_reasoning_content_str(
+                        delta_obj["reasoningContent"]
+                    )
+                    thinking_blocks = self.translate_thinking_blocks(
+                        delta_obj["reasoningContent"]
+                    )
             elif (
                 "contentBlockIndex" in chunk_data
             ):  # stop block, no 'start' or 'delta' object
@@ -1321,7 +1357,6 @@
                     }
             elif "stopReason" in chunk_data:
                 finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop"))
-                is_finished = True
             elif "usage" in chunk_data:
                 usage = ChatCompletionUsageBlock(
                     prompt_tokens=chunk_data.get("inputTokens", 0),
@@ -1329,18 +1364,33 @@
                     total_tokens=chunk_data.get("totalTokens", 0),
                 )
 
-            response = GChunk(
-                text=text,
-                tool_use=tool_use,
-                is_finished=is_finished,
-                finish_reason=finish_reason,
-                usage=usage,
-                index=index,
-            )
-
+            model_response_provider_specific_fields = {}
             if "trace" in chunk_data:
                 trace = chunk_data.get("trace")
-                response["provider_specific_fields"] = {"trace": trace}
+                model_response_provider_specific_fields["trace"] = trace
+
+            response = ModelResponseStream(
+                choices=[
+                    StreamingChoices(
+                        finish_reason=finish_reason,
+                        index=index,
+                        delta=Delta(
+                            content=text,
+                            role="assistant",
+                            tool_calls=[tool_use] if tool_use else None,
+                            provider_specific_fields=(
+                                provider_specific_fields
+                                if provider_specific_fields
+                                else None
+                            ),
+                            thinking_blocks=thinking_blocks,
+                            reasoning_content=reasoning_content,
+                        ),
+                    )
+                ],
+                usage=usage,
+                provider_specific_fields=model_response_provider_specific_fields,
+            )
             return response
         except Exception as e:
             raise Exception("Received streaming error - {}".format(str(e)))
@@ -1486,7 +1536,7 @@ class AmazonAnthropicClaudeStreamDecoder(AWSEventStreamDecoder):
             sync_stream=sync_stream,
         )
 
-    def _chunk_parser(self, chunk_data: dict) -> GChunk:
+    def _chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
         return self.anthropic_model_response_iterator.chunk_parser(chunk=chunk_data)
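
For orientation, a sketch of the translation the converse_chunk_parser hunk performs; the raw event shape follows Bedrock's ConverseStream API, and the surrounding plumbing is omitted:

# A raw Bedrock ConverseStream delta event carrying reasoning text:
chunk_data = {
    "contentBlockIndex": 0,
    "delta": {"reasoningContent": {"text": "First, consider the premise..."}},
}

# AWSEventStreamDecoder.converse_chunk_parser(chunk_data) now returns a
# ModelResponseStream whose delta carries:
#   delta.reasoning_content         -> "First, consider the premise..."
#   delta.thinking_blocks           -> [{"type": "thinking",
#                                        "thinking": "First, consider the premise..."}]
#   delta.provider_specific_fields  -> the untouched reasoningContent block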