Litellm dev 03 05 2025 p3 (#9023)

* fix(invoke_handler.py): fix Converse streaming - return the signature block and keep the response consistent with the Anthropic API

* build(model_prices_and_context_window.json): fix anthropic api claude-3-7 max output tokens

With the Anthropic beta header, the maximum output is 128k tokens (see the usage sketch after these notes).

Resolves https://github.com/BerriAI/litellm/issues/8964

* feat(handler.py): handle new anthropic 'thinking_delta' block on streaming

Fixes https://github.com/BerriAI/litellm/issues/8825
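
The three fixes combine in a single call. A minimal usage sketch follows; the `reasoning_content` delta field and the `output-128k-2025-02-19` beta token are assumptions here (LiteLLM's streaming field for thinking text and Anthropic's documented beta opt-in at the time of this commit), not something this commit itself adds:

import litellm

# Sketch: stream Claude 3.7 Sonnet with extended thinking and the 128k
# output beta enabled, then read the thinking text from each delta.
# Assumptions: litellm forwards extra_headers to Anthropic, and surfaces
# thinking_delta text as `reasoning_content` on streamed deltas.
resp = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    thinking={"type": "enabled", "budget_tokens": 1024},
    max_tokens=128000,
    extra_headers={"anthropic-beta": "output-128k-2025-02-19"},
    stream=True,
)
for chunk in resp:
    delta = chunk.choices[0].delta
    if getattr(delta, "reasoning_content", None):
        print(delta.reasoning_content, end="")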
Krish Dholakia, 2025-03-05 22:31:39 -08:00, commit 744e10b0f0 (parent f6535ae6ad)
5 changed files with 26 additions and 11 deletions

litellm/llms/anthropic/chat/handler.py

@@ -474,7 +474,10 @@ class ModelResponseIterator:
         if len(self.content_blocks) == 0:
             return False
-        if self.content_blocks[0]["delta"]["type"] == "text_delta":
+        if (
+            self.content_blocks[0]["delta"]["type"] == "text_delta"
+            or self.content_blocks[0]["delta"]["type"] == "thinking_delta"
+        ):
             return False
         for block in self.content_blocks:
@@ -617,9 +620,11 @@ class ModelResponseIterator:
                         "index": self.tool_index,
                     }
             elif type_chunk == "content_block_stop":
                 ContentBlockStop(**chunk)  # type: ignore
                 # check if tool call content block
                 is_empty = self.check_empty_tool_call_args()
                 if is_empty:
                     tool_use = {
                         "id": None,

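For context, Anthropic's streaming API with extended thinking enabled emits content_block events along these lines (shapes paraphrased from Anthropic's streaming documentation; values illustrative):

# content_block_start opens a thinking block, thinking_delta carries the text,
# signature_delta carries its signature, content_block_stop closes the block.
{"type": "content_block_start", "index": 0, "content_block": {"type": "thinking", "thinking": ""}}
{"type": "content_block_delta", "index": 0, "delta": {"type": "thinking_delta", "thinking": "Let me think..."}}
{"type": "content_block_delta", "index": 0, "delta": {"type": "signature_delta", "signature": "EqQBCg..."}}
{"type": "content_block_stop", "index": 0}

The check above treats a leading thinking_delta like text_delta, so a stream that opens with thinking is not mistaken for an empty tool call.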
litellm/llms/bedrock/chat/invoke_handler.py

@@ -1260,6 +1260,9 @@ class AWSEventStreamDecoder:
             _thinking_block = ChatCompletionThinkingBlock(type="thinking")
             if "text" in thinking_block:
                 _thinking_block["thinking"] = thinking_block["text"]
+            elif "signature" in thinking_block:
+                _thinking_block["signature"] = thinking_block["signature"]
+                _thinking_block["thinking"] = ""  # consistent with anthropic response
             thinking_blocks_list.append(_thinking_block)
         return thinking_blocks_list
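
On the Bedrock side, the Converse stream delivers reasoning through contentBlockDelta events shaped roughly like this (field names per the AWS Converse stream API; values illustrative):

{"contentBlockIndex": 0, "delta": {"reasoningContent": {"text": "Let me think..."}}}
{"contentBlockIndex": 0, "delta": {"reasoningContent": {"signature": "EqQBCg..."}}}

The new elif branch maps the signature-only delta into a thinking block with an empty thinking string, so downstream consumers see the same shape the Anthropic API returns.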
@@ -1322,6 +1325,12 @@ class AWSEventStreamDecoder:
                 thinking_blocks = self.translate_thinking_blocks(
                     delta_obj["reasoningContent"]
                 )
+                if (
+                    thinking_blocks
+                    and len(thinking_blocks) > 0
+                    and reasoning_content is None
+                ):
+                    reasoning_content = ""  # set to a non-None string for consistency with Anthropic
             elif (
                 "contentBlockIndex" in chunk_data
             ):  # stop block, no 'start' or 'delta' object

model_prices_and_context_window.json

@@ -2913,9 +2913,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-latest": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,
@@ -2932,9 +2932,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-20250219": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,

litellm/model_prices_and_context_window.json

@@ -2913,9 +2913,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-latest": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,
@@ -2932,9 +2932,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-20250219": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,


@@ -1196,10 +1196,11 @@ def test_anthropic_thinking_output(model):
     [
         "anthropic/claude-3-7-sonnet-20250219",
         # "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        # "bedrock/invoke/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
     ],
 )
 def test_anthropic_thinking_output_stream(model):
-    # litellm.set_verbose = True
+    litellm.set_verbose = True
     try:
         # litellm._turn_on_debug()
         resp = litellm.completion(
@@ -1207,7 +1208,7 @@ def test_anthropic_thinking_output_stream(model):
             messages=[{"role": "user", "content": "Tell me a joke."}],
             stream=True,
             thinking={"type": "enabled", "budget_tokens": 1024},
-            timeout=5,
+            timeout=10,
         )
         reasoning_content_exists = False