Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-25 18:54:30 +00:00
Support caching on reasoning content + other fixes (#8973)
* fix(factory.py): pass on anthropic thinking content from assistant call
* fix(factory.py): fix anthropic messages to handle thinking blocks
  Fixes https://github.com/BerriAI/litellm/issues/8961
* fix(factory.py): fix bedrock handling for assistant content in messages
  Fixes https://github.com/BerriAI/litellm/issues/8961
* feat(convert_dict_to_response.py): handle reasoning content + thinking blocks in chat completion block
  Ensures caching works for the anthropic thinking block
* fix(convert_dict_to_response.py): pass all message params to delta block
  Ensures the streaming delta also contains the reasoning content / thinking block
* test(test_prompt_factory.py): remove redundant test
  Anthropic now supports assistant as the first message
* fix(factory.py): fix linting errors
* fix: fix code qa
* test: remove falsy test
* fix(litellm_logging.py): fix str conversion
This commit is contained in:
parent 4c8b4fefc9
commit 662c59adcf
11 changed files with 230 additions and 50 deletions
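The headline change, end to end: with litellm's cache enabled, a response that carries reasoning_content / thinking_blocks is now stored and returned intact on a cache hit. Below is a minimal sketch of that behavior, adapted from the new test_caching_with_reasoning_content test further down in this diff; the model name, the in-memory Cache() default, and the import path are assumptions, and any Anthropic model that supports "thinking" plus a configured API key should behave the same way.

import uuid

import litellm
from litellm.caching.caching import Cache  # import path assumed from the current litellm layout

litellm.cache = Cache()  # default in-memory cache

messages = [{"role": "user", "content": f"what is litellm? {uuid.uuid4()}"}]

# First call goes to the provider; the response (including reasoning content
# and thinking blocks) is written to the cache.
response_1 = litellm.completion(
    model="anthropic/claude-3-7-sonnet-latest",
    messages=messages,
    thinking={"type": "enabled", "budget_tokens": 1024},
)

# Second identical call is served from the cache and still exposes the
# reasoning content.
response_2 = litellm.completion(
    model="anthropic/claude-3-7-sonnet-latest",
    messages=messages,
    thinking={"type": "enabled", "budget_tokens": 1024},
)

assert response_2._hidden_params["cache_hit"] == True
assert response_2.choices[0].message.reasoning_content is not None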
@@ -247,7 +247,6 @@ class LLMCachingHandler:
                pass
            else:
                call_type = original_function.__name__

                cached_result = self._convert_cached_result_to_model_response(
                    cached_result=cached_result,
                    call_type=call_type,
@@ -725,6 +724,7 @@ class LLMCachingHandler:
        """
        Sync internal method to add the result to the cache
        """

        new_kwargs = kwargs.copy()
        new_kwargs.update(
            convert_args_to_kwargs(
@@ -738,6 +738,7 @@ class LLMCachingHandler:
        if self._should_store_result_in_cache(
            original_function=self.original_function, kwargs=new_kwargs
        ):

            litellm.cache.add_cache(result, **new_kwargs)

        return
@@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
 import litellm
 from litellm._logging import verbose_logger
 from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.types.llms.openai import ChatCompletionThinkingBlock
 from litellm.types.utils import (
     ChatCompletionDeltaToolCall,
     ChatCompletionMessageToolCall,
@@ -128,12 +129,7 @@ def convert_to_streaming_response(response_object: Optional[dict] = None):
     model_response_object = ModelResponse(stream=True)
     choice_list = []
     for idx, choice in enumerate(response_object["choices"]):
-        delta = Delta(
-            content=choice["message"].get("content", None),
-            role=choice["message"]["role"],
-            function_call=choice["message"].get("function_call", None),
-            tool_calls=choice["message"].get("tool_calls", None),
-        )
+        delta = Delta(**choice["message"])
         finish_reason = choice.get("finish_reason", None)
         if finish_reason is None:
             # gpt-4 vision can return 'finish_reason' or 'finish_details'
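A rough sketch of what the one-line Delta(**choice["message"]) change above buys: a cached or reconstructed message dict may carry extra fields such as reasoning_content and thinking_blocks, and unpacking the whole dict forwards them to the streaming delta instead of dropping them. The Delta import path and its acceptance of these extra fields are assumptions based on the hunk above (the new call relies on exactly that), and the field values are made up.

from litellm.types.utils import Delta

message = {
    "role": "assistant",
    "content": "LiteLLM is a unified interface to many LLM providers.",
    "reasoning_content": "The user asked what litellm is, so summarize the project ...",
    "thinking_blocks": [
        {
            "type": "thinking",
            "thinking": "The user asked what litellm is ...",
            "signature": "<provider signature>",
        }
    ],
}

# Every message field survives the conversion into the streaming delta.
delta = Delta(**message)
print(delta.reasoning_content)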
@@ -456,11 +452,20 @@ def convert_to_model_response_object( # noqa: PLR0915
                    provider_specific_fields[field] = choice["message"][field]

                # Handle reasoning models that display `reasoning_content` within `content`

                if "reasoning_content" in choice["message"]:
                    reasoning_content = choice["message"]["reasoning_content"]
                    content = choice["message"]["content"]
                else:
                    reasoning_content, content = _parse_content_for_reasoning(
                        choice["message"].get("content")
                    )

                # Handle thinking models that display `thinking_blocks` within `content`
                thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
                if "thinking_blocks" in choice["message"]:
                    thinking_blocks = choice["message"]["thinking_blocks"]
                    provider_specific_fields["thinking_blocks"] = thinking_blocks

                if reasoning_content:
                    provider_specific_fields["reasoning_content"] = (
                        reasoning_content
@@ -474,6 +479,7 @@ def convert_to_model_response_object( # noqa: PLR0915
                    audio=choice["message"].get("audio", None),
                    provider_specific_fields=provider_specific_fields,
                    reasoning_content=reasoning_content,
                    thinking_blocks=thinking_blocks,
                )
                finish_reason = choice.get("finish_reason", None)
                if finish_reason is None:
@@ -1282,6 +1282,7 @@ def add_cache_control_to_content(
         AnthropicMessagesImageParam,
         AnthropicMessagesTextParam,
         AnthropicMessagesDocumentParam,
+        ChatCompletionThinkingBlock,
     ],
     orignal_content_element: Union[dict, AllMessageValues],
 ):
@@ -1454,12 +1455,23 @@ def anthropic_messages_pt( # noqa: PLR0915
                 assistant_content_block["content"], list
             ):
                 for m in assistant_content_block["content"]:
-                    # handle text
+                    # handle thinking blocks
+                    thinking_block = cast(str, m.get("thinking", ""))
+                    text_block = cast(str, m.get("text", ""))
                     if (
-                        m.get("type", "") == "text" and len(m.get("text", "")) > 0
+                        m.get("type", "") == "thinking" and len(thinking_block) > 0
                     ): # don't pass empty text blocks. anthropic api raises errors.
+                        anthropic_message: Union[
+                            ChatCompletionThinkingBlock,
+                            AnthropicMessagesTextParam,
+                        ] = cast(ChatCompletionThinkingBlock, m)
+                        assistant_content.append(anthropic_message)
+                    # handle text
+                    elif (
+                        m.get("type", "") == "text" and len(text_block) > 0
+                    ): # don't pass empty text blocks. anthropic api raises errors.
                         anthropic_message = AnthropicMessagesTextParam(
-                            type="text", text=m.get("text")
+                            type="text", text=text_block
                         )
                         _cached_message = add_cache_control_to_content(
                             anthropic_content_element=anthropic_message,
@@ -1512,6 +1524,7 @@ def anthropic_messages_pt( # noqa: PLR0915
        msg_i += 1

    if assistant_content:

        new_messages.append({"role": "assistant", "content": assistant_content})

    if msg_i == init_msg_i: # prevent infinite loops
@@ -1520,17 +1533,6 @@ def anthropic_messages_pt( # noqa: PLR0915
             model=model,
             llm_provider=llm_provider,
         )
-        if not new_messages or new_messages[0]["role"] != "user":
-            if litellm.modify_params:
-                new_messages.insert(
-                    0, {"role": "user", "content": [{"type": "text", "text": "."}]}
-                )
-            else:
-                raise Exception(
-                    "Invalid first message={}. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, ".format(
-                        new_messages
-                    )
-                )

     if new_messages[-1]["role"] == "assistant":
         if isinstance(new_messages[-1]["content"], str):
@@ -2924,7 +2926,14 @@ class BedrockConverseMessagesProcessor:
                     assistants_parts: List[BedrockContentBlock] = []
                     for element in _assistant_content:
                         if isinstance(element, dict):
-                            if element["type"] == "text":
+                            if element["type"] == "thinking":
+                                thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
+                                    thinking_blocks=[
+                                        cast(ChatCompletionThinkingBlock, element)
+                                    ]
+                                )
+                                assistants_parts.extend(thinking_block)
+                            elif element["type"] == "text":
                                 assistants_part = BedrockContentBlock(
                                     text=element["text"]
                                 )
@@ -3157,7 +3166,14 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
                 assistants_parts: List[BedrockContentBlock] = []
                 for element in _assistant_content:
                     if isinstance(element, dict):
-                        if element["type"] == "text":
+                        if element["type"] == "thinking":
+                            thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
+                                thinking_blocks=[
+                                    cast(ChatCompletionThinkingBlock, element)
+                                ]
+                            )
+                            assistants_parts.extend(thinking_block)
+                        elif element["type"] == "text":
                             assistants_part = BedrockContentBlock(text=element["text"])
                             assistants_parts.append(assistants_part)
                         elif element["type"] == "image_url":
@@ -361,6 +361,7 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
     type: Required[Literal["thinking"]]
     thinking: str
     signature_delta: str
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]


 class OpenAIChatCompletionTextObject(TypedDict):
@@ -449,7 +450,11 @@ class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False):

 class OpenAIChatCompletionAssistantMessage(TypedDict, total=False):
     role: Required[Literal["assistant"]]
-    content: Optional[Union[str, Iterable[ChatCompletionTextObject]]]
+    content: Optional[
+        Union[
+            str, Iterable[Union[ChatCompletionTextObject, ChatCompletionThinkingBlock]]
+        ]
+    ]
     name: Optional[str]
     tool_calls: Optional[List[ChatCompletionAssistantToolCall]]
     function_call: Optional[ChatCompletionToolCallFunctionChunk]
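A sketch of what the widened assistant-message content type above permits: an assistant turn can now be replayed to the provider together with its thinking block(s), which is what "pass on anthropic thinking content from assistant call" refers to. The thinking text and signature below are placeholders, not real provider output.

assistant_turn = {
    "role": "assistant",
    "content": [
        {
            "type": "thinking",
            "thinking": "The user wants the weather, so call the get_weather tool first ...",
            "signature": "<signature string returned by the provider>",
        },
        {"type": "text", "text": "Let me look that up."},
    ],
}

# Appending a dict like this to `messages` and calling litellm.completion(...) again
# is the multi-turn flow exercised by the anthropic/bedrock thinking tests in this commit.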
@@ -1048,6 +1048,7 @@ def client(original_function): # noqa: PLR0915
            )

            if caching_handler_response.cached_result is not None:
                verbose_logger.debug("Cache hit!")
                return caching_handler_response.cached_result

            # CHECK MAX TOKENS
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -855,3 +855,51 @@ def test_convert_to_model_response_object_with_empty_str():
     resp: ModelResponse = convert_to_model_response_object(**args)
     assert resp is not None
     assert resp.choices[0].message.content is not None
+
+
+def test_convert_to_model_response_object_with_thinking_content():
+    """Test that convert_to_model_response_object handles thinking content correctly."""
+
+    args = {
+        "response_object": {
+            "id": "chatcmpl-8cc87354-70f3-4a14-b71b-332e965d98d2",
+            "created": 1741057687,
+            "model": "claude-3-7-sonnet-20250219",
+            "object": "chat.completion",
+            "system_fingerprint": None,
+            "choices": [
+                {
+                    "finish_reason": "stop",
+                    "index": 0,
+                    "message": {
+                        "content": "# LiteLLM\n\nLiteLLM is an open-source library that provides a unified interface for working with various Large Language Models (LLMs). It acts as an abstraction layer that lets developers interact with multiple LLM providers through a single, consistent API.\n\n## Key features:\n\n- **Universal API**: Standardizes interactions with models from OpenAI, Anthropic, Cohere, Azure, and many other providers\n- **Simple switching**: Easily swap between different LLM providers without changing your code\n- **Routing capabilities**: Manage load balancing, fallbacks, and cost optimization\n- **Prompt templates**: Handle different model-specific prompt formats automatically\n- **Logging and observability**: Track usage, performance, and costs across providers\n\nLiteLLM is particularly useful for teams who want flexibility in their LLM infrastructure without creating custom integration code for each provider.",
+                        "role": "assistant",
+                        "tool_calls": None,
+                        "function_call": None,
+                        "reasoning_content": "The person is asking about \"litellm\" and included what appears to be a UUID or some form of identifier at the end of their message (fffffe14-7991-43d0-acd8-d3e606db31a8).\n\nLiteLLM is an open-source library/project that provides a unified interface for working with various Large Language Models (LLMs). It's essentially a lightweight package that standardizes the way developers can work with different LLM APIs like OpenAI, Anthropic, Cohere, etc. through a consistent interface.\n\nSome key features and aspects of LiteLLM:\n\n1. Unified API for multiple LLM providers (OpenAI, Anthropic, Azure, etc.)\n2. Standardized input/output formats\n3. Handles routing, fallbacks, and load balancing\n4. Provides logging and observability\n5. Can help with cost tracking across different providers\n6. Makes it easier to switch between different LLM providers\n\nThe UUID-like string they included doesn't seem directly related to the question, unless it's some form of identifier they're including for tracking purposes.",
+                        "thinking_blocks": [
+                            {
+                                "type": "thinking",
+                                "thinking": "The person is asking about \"litellm\" and included what appears to be a UUID or some form of identifier at the end of their message (fffffe14-7991-43d0-acd8-d3e606db31a8).\n\nLiteLLM is an open-source library/project that provides a unified interface for working with various Large Language Models (LLMs). It's essentially a lightweight package that standardizes the way developers can work with different LLM APIs like OpenAI, Anthropic, Cohere, etc. through a consistent interface.\n\nSome key features and aspects of LiteLLM:\n\n1. Unified API for multiple LLM providers (OpenAI, Anthropic, Azure, etc.)\n2. Standardized input/output formats\n3. Handles routing, fallbacks, and load balancing\n4. Provides logging and observability\n5. Can help with cost tracking across different providers\n6. Makes it easier to switch between different LLM providers\n\nThe UUID-like string they included doesn't seem directly related to the question, unless it's some form of identifier they're including for tracking purposes.",
+                                "signature": "ErUBCkYIARgCIkCf+r0qMSOMYkjlFERM00IxsY9I/m19dQGEF/Zv1E0AtvdZjKGnr+nr5vXUldmb/sUCgrQRH4YUyV0X3MoMrsNnEgxDqhUFcUTg1vM0CroaDEY1wKJ0Ca0EZ6S1jCIwF8ATum3xiF/mRSIIjoD6Virh0hFcOfH3Sz6Chtev9WUwwYMAVP4/hyzbrUDnsUlmKh0CfTayaXm6o63/6Kelr6pzLbErjQx2xZRnRjCypw==",
+                            }
+                        ],
+                    },
+                }
+            ],
+            "usage": {
+                "completion_tokens": 460,
+                "prompt_tokens": 65,
+                "total_tokens": 525,
+                "completion_tokens_details": None,
+                "prompt_tokens_details": {"audio_tokens": None, "cached_tokens": 0},
+                "cache_creation_input_tokens": 0,
+                "cache_read_input_tokens": 0,
+            },
+        },
+        "model_response_object": ModelResponse(),
+    }
+
+    resp: ModelResponse = convert_to_model_response_object(**args)
+    assert resp is not None
+    assert resp.choices[0].message.reasoning_content is not None
@@ -125,28 +125,6 @@ def test_anthropic_pt_formatting():
     assert anthropic_pt(messages) == expected_prompt


-def test_anthropic_messages_pt():
-    # Test case: No messages (filtered system messages only)
-    litellm.modify_params = True
-    messages = []
-    expected_messages = [{"role": "user", "content": [{"type": "text", "text": "."}]}]
-    assert (
-        anthropic_messages_pt(
-            messages, model="claude-3-sonnet-20240229", llm_provider="anthropic"
-        )
-        == expected_messages
-    )
-
-    # Test case: No messages (filtered system messages only) when modify_params is False should raise error
-    litellm.modify_params = False
-    messages = []
-    with pytest.raises(Exception) as err:
-        anthropic_messages_pt(
-            messages, model="claude-3-sonnet-20240229", llm_provider="anthropic"
-        )
-    assert "Invalid first message" in str(err.value)
-
-
 def test_anthropic_messages_nested_pt():
     from litellm.types.llms.anthropic import (
         AnthopicMessagesAssistantMessageParam,
@@ -2561,3 +2561,30 @@ def test_redis_caching_multiple_namespaces():

     # request 4 without a namespace should not be cached under the same key as request 3
     assert response_4.id != response_3.id
+
+
+def test_caching_with_reasoning_content():
+    """
+    Test that reasoning content is cached
+    """
+
+    import uuid
+
+    messages = [{"role": "user", "content": f"what is litellm? {uuid.uuid4()}"}]
+    litellm.cache = Cache()
+
+    response_1 = completion(
+        model="anthropic/claude-3-7-sonnet-latest",
+        messages=messages,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+    )
+
+    response_2 = completion(
+        model="anthropic/claude-3-7-sonnet-latest",
+        messages=messages,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+    )
+
+    print(f"response 2: {response_2.model_dump_json(indent=4)}")
+    assert response_2._hidden_params["cache_hit"] == True
+    assert response_2.choices[0].message.reasoning_content is not None
@@ -257,6 +257,8 @@ def test_aaparallel_function_call_with_anthropic_thinking(model):
            thinking={"type": "enabled", "budget_tokens": 1024},
        ) # get a new response from the model where it can see the function response
        print("second response\n", second_response)

        ## THIRD RESPONSE
    except litellm.InternalServerError as e:
        print(e)
    except litellm.RateLimitError as e: