Support caching on reasoning content + other fixes (#8973)

* fix(factory.py): pass on anthropic thinking content from assistant call

* fix(factory.py): fix anthropic messages to handle thinking blocks

Fixes https://github.com/BerriAI/litellm/issues/8961

* fix(factory.py): fix bedrock handling for assistant content in messages

Fixes https://github.com/BerriAI/litellm/issues/8961

* feat(convert_dict_to_response.py): handle reasoning content + thinking blocks in chat completion block

ensures caching works for anthropic thinking blocks

* fix(convert_dict_to_response.py): pass all message params to delta block

ensures the streaming delta also contains the reasoning content / thinking blocks

* test(test_prompt_factory.py): remove redundant test

anthropic now supports assistant as the first message

* fix(factory.py): fix linting errors

* fix: fix code qa

* test: remove falsy test

* fix(litellm_logging.py): fix str conversion
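
For reference, a minimal usage sketch of what this commit enables, adapted from the new test_caching_with_reasoning_content test included in the diff below (the in-memory litellm.Cache() setup, the model string, and the thinking parameter are taken from that test; this is an illustration of the behavior, not new API surface):

import uuid

import litellm
from litellm import completion

litellm.cache = litellm.Cache()  # in-memory cache by default

# unique suffix so the first call is guaranteed to be a cache miss
messages = [{"role": "user", "content": f"what is litellm? {uuid.uuid4()}"}]

# first call goes to the provider; the cached entry now retains
# reasoning_content and thinking_blocks on the message
response_1 = completion(
    model="anthropic/claude-3-7-sonnet-latest",
    messages=messages,
    thinking={"type": "enabled", "budget_tokens": 1024},
)

# second identical call is served from cache and still exposes the reasoning content
response_2 = completion(
    model="anthropic/claude-3-7-sonnet-latest",
    messages=messages,
    thinking={"type": "enabled", "budget_tokens": 1024},
)
assert response_2._hidden_params["cache_hit"] is True
assert response_2.choices[0].message.reasoning_content is not None
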
Krish Dholakia 2025-03-04 21:12:16 -08:00 committed by GitHub
parent 4c8b4fefc9
commit 662c59adcf
11 changed files with 230 additions and 50 deletions

@@ -247,7 +247,6 @@ class LLMCachingHandler:
pass
else:
call_type = original_function.__name__
cached_result = self._convert_cached_result_to_model_response(
cached_result=cached_result,
call_type=call_type,
@@ -725,6 +724,7 @@ class LLMCachingHandler:
"""
Sync internal method to add the result to the cache
"""
new_kwargs = kwargs.copy()
new_kwargs.update(
convert_args_to_kwargs(
@@ -738,6 +738,7 @@ class LLMCachingHandler:
if self._should_store_result_in_cache(
original_function=self.original_function, kwargs=new_kwargs
):
litellm.cache.add_cache(result, **new_kwargs)
return

@@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
import litellm
from litellm._logging import verbose_logger
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
from litellm.types.llms.openai import ChatCompletionThinkingBlock
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
ChatCompletionMessageToolCall,
@@ -128,12 +129,7 @@ def convert_to_streaming_response(response_object: Optional[dict] = None):
model_response_object = ModelResponse(stream=True)
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
delta = Delta(**choice["message"])
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
@@ -456,10 +452,19 @@ def convert_to_model_response_object( # noqa: PLR0915
provider_specific_fields[field] = choice["message"][field]
# Handle reasoning models that display `reasoning_content` within `content`
if "reasoning_content" in choice["message"]:
reasoning_content = choice["message"]["reasoning_content"]
content = choice["message"]["content"]
else:
reasoning_content, content = _parse_content_for_reasoning(
choice["message"].get("content")
)
reasoning_content, content = _parse_content_for_reasoning(
choice["message"].get("content")
)
# Handle thinking models that display `thinking_blocks` within `content`
thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
if "thinking_blocks" in choice["message"]:
thinking_blocks = choice["message"]["thinking_blocks"]
provider_specific_fields["thinking_blocks"] = thinking_blocks
if reasoning_content:
provider_specific_fields["reasoning_content"] = (
@@ -474,6 +479,7 @@ def convert_to_model_response_object( # noqa: PLR0915
audio=choice["message"].get("audio", None),
provider_specific_fields=provider_specific_fields,
reasoning_content=reasoning_content,
thinking_blocks=thinking_blocks,
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:

@@ -1282,6 +1282,7 @@ def add_cache_control_to_content(
AnthropicMessagesImageParam,
AnthropicMessagesTextParam,
AnthropicMessagesDocumentParam,
ChatCompletionThinkingBlock,
],
orignal_content_element: Union[dict, AllMessageValues],
):
@@ -1454,12 +1455,23 @@ def anthropic_messages_pt( # noqa: PLR0915
assistant_content_block["content"], list
):
for m in assistant_content_block["content"]:
# handle text
# handle thinking blocks
thinking_block = cast(str, m.get("thinking", ""))
text_block = cast(str, m.get("text", ""))
if (
m.get("type", "") == "text" and len(m.get("text", "")) > 0
m.get("type", "") == "thinking" and len(thinking_block) > 0
): # don't pass empty text blocks. anthropic api raises errors.
anthropic_message: Union[
ChatCompletionThinkingBlock,
AnthropicMessagesTextParam,
] = cast(ChatCompletionThinkingBlock, m)
assistant_content.append(anthropic_message)
# handle text
elif (
m.get("type", "") == "text" and len(text_block) > 0
): # don't pass empty text blocks. anthropic api raises errors.
anthropic_message = AnthropicMessagesTextParam(
type="text", text=m.get("text")
type="text", text=text_block
)
_cached_message = add_cache_control_to_content(
anthropic_content_element=anthropic_message,
@@ -1512,6 +1524,7 @@ def anthropic_messages_pt( # noqa: PLR0915
msg_i += 1
if assistant_content:
new_messages.append({"role": "assistant", "content": assistant_content})
if msg_i == init_msg_i: # prevent infinite loops
@@ -1520,17 +1533,6 @@ def anthropic_messages_pt( # noqa: PLR0915
model=model,
llm_provider=llm_provider,
)
if not new_messages or new_messages[0]["role"] != "user":
if litellm.modify_params:
new_messages.insert(
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
)
else:
raise Exception(
"Invalid first message={}. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, ".format(
new_messages
)
)
if new_messages[-1]["role"] == "assistant":
if isinstance(new_messages[-1]["content"], str):
@@ -2924,7 +2926,14 @@ class BedrockConverseMessagesProcessor:
assistants_parts: List[BedrockContentBlock] = []
for element in _assistant_content:
if isinstance(element, dict):
if element["type"] == "text":
if element["type"] == "thinking":
thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
thinking_blocks=[
cast(ChatCompletionThinkingBlock, element)
]
)
assistants_parts.extend(thinking_block)
elif element["type"] == "text":
assistants_part = BedrockContentBlock(
text=element["text"]
)
@@ -3157,7 +3166,14 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
assistants_parts: List[BedrockContentBlock] = []
for element in _assistant_content:
if isinstance(element, dict):
if element["type"] == "text":
if element["type"] == "thinking":
thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
thinking_blocks=[
cast(ChatCompletionThinkingBlock, element)
]
)
assistants_parts.extend(thinking_block)
elif element["type"] == "text":
assistants_part = BedrockContentBlock(text=element["text"])
assistants_parts.append(assistants_part)
elif element["type"] == "image_url":

@@ -361,6 +361,7 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
type: Required[Literal["thinking"]]
thinking: str
signature_delta: str
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class OpenAIChatCompletionTextObject(TypedDict):
@@ -449,7 +450,11 @@ class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False):
class OpenAIChatCompletionAssistantMessage(TypedDict, total=False):
role: Required[Literal["assistant"]]
content: Optional[Union[str, Iterable[ChatCompletionTextObject]]]
content: Optional[
Union[
str, Iterable[Union[ChatCompletionTextObject, ChatCompletionThinkingBlock]]
]
]
name: Optional[str]
tool_calls: Optional[List[ChatCompletionAssistantToolCall]]
function_call: Optional[ChatCompletionToolCallFunctionChunk]

@@ -1048,6 +1048,7 @@ def client(original_function): # noqa: PLR0915
)
if caching_handler_response.cached_result is not None:
verbose_logger.debug("Cache hit!")
return caching_handler_response.cached_result
# CHECK MAX TOKENS

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -855,3 +855,51 @@ def test_convert_to_model_response_object_with_empty_str():
resp: ModelResponse = convert_to_model_response_object(**args)
assert resp is not None
assert resp.choices[0].message.content is not None
def test_convert_to_model_response_object_with_thinking_content():
"""Test that convert_to_model_response_object handles thinking content correctly."""
args = {
"response_object": {
"id": "chatcmpl-8cc87354-70f3-4a14-b71b-332e965d98d2",
"created": 1741057687,
"model": "claude-3-7-sonnet-20250219",
"object": "chat.completion",
"system_fingerprint": None,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "# LiteLLM\n\nLiteLLM is an open-source library that provides a unified interface for working with various Large Language Models (LLMs). It acts as an abstraction layer that lets developers interact with multiple LLM providers through a single, consistent API.\n\n## Key features:\n\n- **Universal API**: Standardizes interactions with models from OpenAI, Anthropic, Cohere, Azure, and many other providers\n- **Simple switching**: Easily swap between different LLM providers without changing your code\n- **Routing capabilities**: Manage load balancing, fallbacks, and cost optimization\n- **Prompt templates**: Handle different model-specific prompt formats automatically\n- **Logging and observability**: Track usage, performance, and costs across providers\n\nLiteLLM is particularly useful for teams who want flexibility in their LLM infrastructure without creating custom integration code for each provider.",
"role": "assistant",
"tool_calls": None,
"function_call": None,
"reasoning_content": "The person is asking about \"litellm\" and included what appears to be a UUID or some form of identifier at the end of their message (fffffe14-7991-43d0-acd8-d3e606db31a8).\n\nLiteLLM is an open-source library/project that provides a unified interface for working with various Large Language Models (LLMs). It's essentially a lightweight package that standardizes the way developers can work with different LLM APIs like OpenAI, Anthropic, Cohere, etc. through a consistent interface.\n\nSome key features and aspects of LiteLLM:\n\n1. Unified API for multiple LLM providers (OpenAI, Anthropic, Azure, etc.)\n2. Standardized input/output formats\n3. Handles routing, fallbacks, and load balancing\n4. Provides logging and observability\n5. Can help with cost tracking across different providers\n6. Makes it easier to switch between different LLM providers\n\nThe UUID-like string they included doesn't seem directly related to the question, unless it's some form of identifier they're including for tracking purposes.",
"thinking_blocks": [
{
"type": "thinking",
"thinking": "The person is asking about \"litellm\" and included what appears to be a UUID or some form of identifier at the end of their message (fffffe14-7991-43d0-acd8-d3e606db31a8).\n\nLiteLLM is an open-source library/project that provides a unified interface for working with various Large Language Models (LLMs). It's essentially a lightweight package that standardizes the way developers can work with different LLM APIs like OpenAI, Anthropic, Cohere, etc. through a consistent interface.\n\nSome key features and aspects of LiteLLM:\n\n1. Unified API for multiple LLM providers (OpenAI, Anthropic, Azure, etc.)\n2. Standardized input/output formats\n3. Handles routing, fallbacks, and load balancing\n4. Provides logging and observability\n5. Can help with cost tracking across different providers\n6. Makes it easier to switch between different LLM providers\n\nThe UUID-like string they included doesn't seem directly related to the question, unless it's some form of identifier they're including for tracking purposes.",
"signature": "ErUBCkYIARgCIkCf+r0qMSOMYkjlFERM00IxsY9I/m19dQGEF/Zv1E0AtvdZjKGnr+nr5vXUldmb/sUCgrQRH4YUyV0X3MoMrsNnEgxDqhUFcUTg1vM0CroaDEY1wKJ0Ca0EZ6S1jCIwF8ATum3xiF/mRSIIjoD6Virh0hFcOfH3Sz6Chtev9WUwwYMAVP4/hyzbrUDnsUlmKh0CfTayaXm6o63/6Kelr6pzLbErjQx2xZRnRjCypw==",
}
],
},
}
],
"usage": {
"completion_tokens": 460,
"prompt_tokens": 65,
"total_tokens": 525,
"completion_tokens_details": None,
"prompt_tokens_details": {"audio_tokens": None, "cached_tokens": 0},
"cache_creation_input_tokens": 0,
"cache_read_input_tokens": 0,
},
},
"model_response_object": ModelResponse(),
}
resp: ModelResponse = convert_to_model_response_object(**args)
assert resp is not None
assert resp.choices[0].message.reasoning_content is not None

@@ -125,28 +125,6 @@ def test_anthropic_pt_formatting():
assert anthropic_pt(messages) == expected_prompt
def test_anthropic_messages_pt():
# Test case: No messages (filtered system messages only)
litellm.modify_params = True
messages = []
expected_messages = [{"role": "user", "content": [{"type": "text", "text": "."}]}]
assert (
anthropic_messages_pt(
messages, model="claude-3-sonnet-20240229", llm_provider="anthropic"
)
== expected_messages
)
# Test case: No messages (filtered system messages only) when modify_params is False should raise error
litellm.modify_params = False
messages = []
with pytest.raises(Exception) as err:
anthropic_messages_pt(
messages, model="claude-3-sonnet-20240229", llm_provider="anthropic"
)
assert "Invalid first message" in str(err.value)
def test_anthropic_messages_nested_pt():
from litellm.types.llms.anthropic import (
AnthopicMessagesAssistantMessageParam,

@@ -2561,3 +2561,30 @@ def test_redis_caching_multiple_namespaces():
# request 4 without a namespace should not be cached under the same key as request 3
assert response_4.id != response_3.id
def test_caching_with_reasoning_content():
"""
Test that reasoning content is cached
"""
import uuid
messages = [{"role": "user", "content": f"what is litellm? {uuid.uuid4()}"}]
litellm.cache = Cache()
response_1 = completion(
model="anthropic/claude-3-7-sonnet-latest",
messages=messages,
thinking={"type": "enabled", "budget_tokens": 1024},
)
response_2 = completion(
model="anthropic/claude-3-7-sonnet-latest",
messages=messages,
thinking={"type": "enabled", "budget_tokens": 1024},
)
print(f"response 2: {response_2.model_dump_json(indent=4)}")
assert response_2._hidden_params["cache_hit"] == True
assert response_2.choices[0].message.reasoning_content is not None

@@ -257,6 +257,8 @@ def test_aaparallel_function_call_with_anthropic_thinking(model):
thinking={"type": "enabled", "budget_tokens": 1024},
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
## THIRD RESPONSE
except litellm.InternalServerError as e:
print(e)
except litellm.RateLimitError as e: