Litellm dev 03 05 2025 p3 (#9023)

* fix(invoke_handler.py): fix Converse streaming - return the signature block and keep the response consistent with the Anthropic API

* build(model_prices_and_context_window.json): fix anthropic api claude-3-7 max output tokens

With the Anthropic beta header, the maximum output is 128k tokens (see the usage sketch after these notes).

Resolves https://github.com/BerriAI/litellm/issues/8964

* feat(handler.py): handle new anthropic 'thinking_delta' block on streaming

Fixes https://github.com/BerriAI/litellm/issues/8825
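
The three fixes combine in a single call. A minimal usage sketch follows; the `reasoning_content` delta field and the `output-128k-2025-02-19` beta token are assumptions here (LiteLLM's streaming field for thinking text and Anthropic's documented beta opt-in at the time of this commit), not something this commit itself adds:

import litellm

# Sketch: stream Claude 3.7 Sonnet with extended thinking and the 128k
# output beta enabled, then read the thinking text from each delta.
# Assumptions: litellm forwards extra_headers to Anthropic, and surfaces
# thinking_delta text as `reasoning_content` on streamed deltas.
resp = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    thinking={"type": "enabled", "budget_tokens": 1024},
    max_tokens=128000,
    extra_headers={"anthropic-beta": "output-128k-2025-02-19"},
    stream=True,
)
for chunk in resp:
    delta = chunk.choices[0].delta
    if getattr(delta, "reasoning_content", None):
        print(delta.reasoning_content, end="")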
Krish Dholakia, 2025-03-05 22:31:39 -08:00, commit 744e10b0f0 (parent f6535ae6ad)
5 changed files with 26 additions and 11 deletions

litellm/llms/anthropic/chat/handler.py

@@ -474,7 +474,10 @@ class ModelResponseIterator:
         if len(self.content_blocks) == 0:
             return False
-        if self.content_blocks[0]["delta"]["type"] == "text_delta":
+        if (
+            self.content_blocks[0]["delta"]["type"] == "text_delta"
+            or self.content_blocks[0]["delta"]["type"] == "thinking_delta"
+        ):
             return False
         for block in self.content_blocks:
@@ -617,9 +620,11 @@ class ModelResponseIterator:
                         "index": self.tool_index,
                     }
             elif type_chunk == "content_block_stop":
                 ContentBlockStop(**chunk)  # type: ignore
                 # check if tool call content block
                 is_empty = self.check_empty_tool_call_args()
                 if is_empty:
                     tool_use = {
                         "id": None,

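For context, Anthropic's streaming API with extended thinking enabled emits content_block events along these lines (shapes paraphrased from Anthropic's streaming documentation; values illustrative):

# content_block_start opens a thinking block, thinking_delta carries the text,
# signature_delta carries its signature, content_block_stop closes the block.
{"type": "content_block_start", "index": 0, "content_block": {"type": "thinking", "thinking": ""}}
{"type": "content_block_delta", "index": 0, "delta": {"type": "thinking_delta", "thinking": "Let me think..."}}
{"type": "content_block_delta", "index": 0, "delta": {"type": "signature_delta", "signature": "EqQBCg..."}}
{"type": "content_block_stop", "index": 0}

The check above treats a leading thinking_delta like text_delta, so a stream that opens with thinking is not mistaken for an empty tool call.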
litellm/llms/bedrock/chat/invoke_handler.py

@@ -1260,6 +1260,9 @@ class AWSEventStreamDecoder:
             _thinking_block = ChatCompletionThinkingBlock(type="thinking")
             if "text" in thinking_block:
                 _thinking_block["thinking"] = thinking_block["text"]
+            elif "signature" in thinking_block:
+                _thinking_block["signature"] = thinking_block["signature"]
+                _thinking_block["thinking"] = ""  # consistent with anthropic response
             thinking_blocks_list.append(_thinking_block)
         return thinking_blocks_list
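
On the Bedrock side, the Converse stream delivers reasoning through contentBlockDelta events shaped roughly like this (field names per the AWS Converse stream API; values illustrative):

{"contentBlockIndex": 0, "delta": {"reasoningContent": {"text": "Let me think..."}}}
{"contentBlockIndex": 0, "delta": {"reasoningContent": {"signature": "EqQBCg..."}}}

The new elif branch maps the signature-only delta into a thinking block with an empty thinking string, so downstream consumers see the same shape the Anthropic API returns.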
@@ -1322,6 +1325,12 @@ class AWSEventStreamDecoder:
                 thinking_blocks = self.translate_thinking_blocks(
                     delta_obj["reasoningContent"]
                 )
+                if (
+                    thinking_blocks
+                    and len(thinking_blocks) > 0
+                    and reasoning_content is None
+                ):
+                    reasoning_content = ""  # set to a non-None string for consistency with Anthropic
             elif (
                 "contentBlockIndex" in chunk_data
             ):  # stop block, no 'start' or 'delta' object

model_prices_and_context_window.json

@@ -2913,9 +2913,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-latest": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,
@@ -2932,9 +2932,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-20250219": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,

litellm/model_prices_and_context_window.json

@@ -2913,9 +2913,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-latest": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,
@@ -2932,9 +2932,9 @@
         "supports_tool_choice": true
     },
     "claude-3-7-sonnet-20250219": {
-        "max_tokens": 8192,
+        "max_tokens": 128000,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 128000,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "cache_creation_input_token_cost": 0.00000375,


@@ -1196,10 +1196,11 @@ def test_anthropic_thinking_output(model):
     [
         "anthropic/claude-3-7-sonnet-20250219",
         # "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        # "bedrock/invoke/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
     ],
 )
 def test_anthropic_thinking_output_stream(model):
-    # litellm.set_verbose = True
+    litellm.set_verbose = True
     try:
         # litellm._turn_on_debug()
         resp = litellm.completion(
@@ -1207,7 +1208,7 @@ def test_anthropic_thinking_output_stream(model):
             messages=[{"role": "user", "content": "Tell me a joke."}],
             stream=True,
             thinking={"type": "enabled", "budget_tokens": 1024},
-            timeout=5,
+            timeout=10,
         )
         reasoning_content_exists = False