Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 18:54:30 +00:00)
Support caching on reasoning content + other fixes (#8973)
* fix(factory.py): pass on anthropic thinking content from assistant call
* fix(factory.py): fix anthropic messages to handle thinking blocks
  Fixes https://github.com/BerriAI/litellm/issues/8961
* fix(factory.py): fix bedrock handling for assistant content in messages
  Fixes https://github.com/BerriAI/litellm/issues/8961
* feat(convert_dict_to_response.py): handle reasoning content + thinking blocks in chat completion block
  Ensures caching works for the anthropic thinking block
* fix(convert_dict_to_response.py): pass all message params to delta block
  Ensures the streaming delta also contains the reasoning content / thinking block
* test(test_prompt_factory.py): remove redundant test
  Anthropic now supports assistant as the first message
* fix(factory.py): fix linting errors
* fix: fix code qa
* test: remove falsy test
* fix(litellm_logging.py): fix str conversion
This commit is contained in: parent 4c8b4fefc9, commit 662c59adcf
11 changed files with 230 additions and 50 deletions
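The net effect, mirrored by the new cache test at the bottom of this diff, is that reasoning content now survives a cache round trip. A minimal usage sketch, assuming an Anthropic API key is configured; the model name and parameters are taken from that test rather than from any official example:

```python
import litellm
from litellm import Cache, completion

litellm.cache = Cache()  # default in-memory cache

messages = [{"role": "user", "content": "what is litellm?"}]

# First call hits the provider; the response, including reasoning content, is written to the cache.
response_1 = completion(
    model="anthropic/claude-3-7-sonnet-latest",
    messages=messages,
    thinking={"type": "enabled", "budget_tokens": 1024},
)

# An identical second call is served from the cache and still carries the reasoning fields.
response_2 = completion(
    model="anthropic/claude-3-7-sonnet-latest",
    messages=messages,
    thinking={"type": "enabled", "budget_tokens": 1024},
)

assert response_2._hidden_params["cache_hit"] is True
assert response_2.choices[0].message.reasoning_content is not None
```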
@@ -247,7 +247,6 @@ class LLMCachingHandler:
                pass
            else:
                call_type = original_function.__name__
-
                cached_result = self._convert_cached_result_to_model_response(
                    cached_result=cached_result,
                    call_type=call_type,
@@ -725,6 +724,7 @@ class LLMCachingHandler:
        """
        Sync internal method to add the result to the cache
        """
+
        new_kwargs = kwargs.copy()
        new_kwargs.update(
            convert_args_to_kwargs(
@@ -738,6 +738,7 @@ class LLMCachingHandler:
        if self._should_store_result_in_cache(
            original_function=self.original_function, kwargs=new_kwargs
        ):

            litellm.cache.add_cache(result, **new_kwargs)

            return
@@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
import litellm
from litellm._logging import verbose_logger
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.types.llms.openai import ChatCompletionThinkingBlock
from litellm.types.utils import (
    ChatCompletionDeltaToolCall,
    ChatCompletionMessageToolCall,
@@ -128,12 +129,7 @@ def convert_to_streaming_response(response_object: Optional[dict] = None):
    model_response_object = ModelResponse(stream=True)
    choice_list = []
    for idx, choice in enumerate(response_object["choices"]):
-        delta = Delta(
-            content=choice["message"].get("content", None),
-            role=choice["message"]["role"],
-            function_call=choice["message"].get("function_call", None),
-            tool_calls=choice["message"].get("tool_calls", None),
-        )
+        delta = Delta(**choice["message"])
        finish_reason = choice.get("finish_reason", None)
        if finish_reason is None:
            # gpt-4 vision can return 'finish_reason' or 'finish_details'
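The streaming fix above replaces a hand-picked set of keys with a full unpack of the message dict, so fields such as reasoning_content and thinking_blocks are no longer silently dropped from streamed deltas. A self-contained sketch of the pattern; FakeDelta below is a hypothetical stand-in, not litellm's Delta class:

```python
# Hypothetical stand-in for a delta object that keeps unknown fields instead of dropping them.
class FakeDelta:
    def __init__(self, content=None, role=None, function_call=None, tool_calls=None, **extra):
        self.content = content
        self.role = role
        self.function_call = function_call
        self.tool_calls = tool_calls
        self.__dict__.update(extra)  # e.g. reasoning_content, thinking_blocks

message = {
    "role": "assistant",
    "content": "The answer is 4.",
    "reasoning_content": "2 + 2 = 4",
}

# Old pattern: only the explicitly listed keys survive.
old_delta = FakeDelta(
    content=message.get("content"),
    role=message["role"],
    function_call=message.get("function_call"),
    tool_calls=message.get("tool_calls"),
)
assert not hasattr(old_delta, "reasoning_content")

# New pattern: unpack the whole message, so reasoning fields reach the streaming delta.
new_delta = FakeDelta(**message)
assert new_delta.reasoning_content == "2 + 2 = 4"
```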
@@ -456,10 +452,19 @@ def convert_to_model_response_object( # noqa: PLR0915
                    provider_specific_fields[field] = choice["message"][field]

            # Handle reasoning models that display `reasoning_content` within `content`
-            reasoning_content, content = _parse_content_for_reasoning(
-                choice["message"].get("content")
-            )
+            if "reasoning_content" in choice["message"]:
+                reasoning_content = choice["message"]["reasoning_content"]
+                content = choice["message"]["content"]
+            else:
+                reasoning_content, content = _parse_content_for_reasoning(
+                    choice["message"].get("content")
+                )
+
+            # Handle thinking models that display `thinking_blocks` within `content`
+            thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
+            if "thinking_blocks" in choice["message"]:
+                thinking_blocks = choice["message"]["thinking_blocks"]
+                provider_specific_fields["thinking_blocks"] = thinking_blocks

            if reasoning_content:
                provider_specific_fields["reasoning_content"] = (
@@ -474,6 +479,7 @@ def convert_to_model_response_object( # noqa: PLR0915
                audio=choice["message"].get("audio", None),
                provider_specific_fields=provider_specific_fields,
                reasoning_content=reasoning_content,
+                thinking_blocks=thinking_blocks,
            )
            finish_reason = choice.get("finish_reason", None)
            if finish_reason is None:
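With the two hunks above, a response dict may carry reasoning either as an explicit reasoning_content field (the cached Anthropic case) or inline in content, and any thinking_blocks are copied into provider_specific_fields so they survive caching. A standalone sketch of that branching; parse_content_for_reasoning here is a hypothetical <think>-tag parser standing in for litellm's _parse_content_for_reasoning:

```python
import re
from typing import List, Optional, Tuple

def parse_content_for_reasoning(content: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    # Hypothetical fallback parser: split "<think>...</think>rest" into (reasoning, rest).
    if not content:
        return None, content
    match = re.match(r"<think>(.*?)</think>(.*)", content, re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    return None, content

def extract_reasoning(message: dict) -> Tuple[Optional[str], Optional[str], Optional[List[dict]]]:
    if "reasoning_content" in message:
        # Cached / Anthropic-style messages carry reasoning_content explicitly.
        reasoning_content = message["reasoning_content"]
        content = message["content"]
    else:
        # Reasoning models that inline their reasoning inside content.
        reasoning_content, content = parse_content_for_reasoning(message.get("content"))
    thinking_blocks = message.get("thinking_blocks")  # preserved so signatures survive caching
    return reasoning_content, content, thinking_blocks

reasoning, text, blocks = extract_reasoning(
    {"content": "<think>2 + 2 = 4</think>The answer is 4."}
)
assert reasoning == "2 + 2 = 4" and text == "The answer is 4." and blocks is None
```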
@@ -1282,6 +1282,7 @@ def add_cache_control_to_content(
        AnthropicMessagesImageParam,
        AnthropicMessagesTextParam,
        AnthropicMessagesDocumentParam,
+        ChatCompletionThinkingBlock,
    ],
    orignal_content_element: Union[dict, AllMessageValues],
):
@@ -1454,12 +1455,23 @@ def anthropic_messages_pt( # noqa: PLR0915
                    assistant_content_block["content"], list
                ):
                    for m in assistant_content_block["content"]:
-                        # handle text
+                        # handle thinking blocks
+                        thinking_block = cast(str, m.get("thinking", ""))
+                        text_block = cast(str, m.get("text", ""))
                        if (
-                            m.get("type", "") == "text" and len(m.get("text", "")) > 0
+                            m.get("type", "") == "thinking" and len(thinking_block) > 0
+                        ): # don't pass empty text blocks. anthropic api raises errors.
+                            anthropic_message: Union[
+                                ChatCompletionThinkingBlock,
+                                AnthropicMessagesTextParam,
+                            ] = cast(ChatCompletionThinkingBlock, m)
+                            assistant_content.append(anthropic_message)
+                        # handle text
+                        elif (
+                            m.get("type", "") == "text" and len(text_block) > 0
                        ): # don't pass empty text blocks. anthropic api raises errors.
                            anthropic_message = AnthropicMessagesTextParam(
-                                type="text", text=m.get("text")
+                                type="text", text=text_block
                            )
                            _cached_message = add_cache_control_to_content(
                                anthropic_content_element=anthropic_message,
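In prompt translation, assistant content lists can now contain thinking blocks alongside text blocks: thinking blocks are forwarded unchanged (keeping their signature), while empty text blocks are still filtered out because the Anthropic API rejects them. A standalone sketch of that filtering; translate_assistant_blocks is a hypothetical helper, the real logic is inline in anthropic_messages_pt:

```python
from typing import List

def translate_assistant_blocks(blocks: List[dict]) -> List[dict]:
    out: List[dict] = []
    for m in blocks:
        thinking = m.get("thinking", "")
        text = m.get("text", "")
        if m.get("type") == "thinking" and len(thinking) > 0:
            out.append(m)  # forwarded as-is so the signature is preserved
        elif m.get("type") == "text" and len(text) > 0:
            out.append({"type": "text", "text": text})  # empty text blocks are dropped
    return out

assert translate_assistant_blocks(
    [
        {"type": "thinking", "thinking": "2 + 2 = 4", "signature": "abc"},
        {"type": "text", "text": "The answer is 4."},
        {"type": "text", "text": ""},
    ]
) == [
    {"type": "thinking", "thinking": "2 + 2 = 4", "signature": "abc"},
    {"type": "text", "text": "The answer is 4."},
]
```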
@@ -1512,6 +1524,7 @@ def anthropic_messages_pt( # noqa: PLR0915
            msg_i += 1

        if assistant_content:

            new_messages.append({"role": "assistant", "content": assistant_content})

        if msg_i == init_msg_i: # prevent infinite loops
@@ -1520,17 +1533,6 @@ def anthropic_messages_pt( # noqa: PLR0915
            model=model,
            llm_provider=llm_provider,
        )
-    if not new_messages or new_messages[0]["role"] != "user":
-        if litellm.modify_params:
-            new_messages.insert(
-                0, {"role": "user", "content": [{"type": "text", "text": "."}]}
-            )
-        else:
-            raise Exception(
-                "Invalid first message={}. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, ".format(
-                    new_messages
-                )
-            )

    if new_messages[-1]["role"] == "assistant":
        if isinstance(new_messages[-1]["content"], str):
@@ -2924,7 +2926,14 @@ class BedrockConverseMessagesProcessor:
                        assistants_parts: List[BedrockContentBlock] = []
                        for element in _assistant_content:
                            if isinstance(element, dict):
-                                if element["type"] == "text":
+                                if element["type"] == "thinking":
+                                    thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
+                                        thinking_blocks=[
+                                            cast(ChatCompletionThinkingBlock, element)
+                                        ]
+                                    )
+                                    assistants_parts.extend(thinking_block)
+                                elif element["type"] == "text":
                                    assistants_part = BedrockContentBlock(
                                        text=element["text"]
                                    )
@@ -3157,7 +3166,14 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
                    assistants_parts: List[BedrockContentBlock] = []
                    for element in _assistant_content:
                        if isinstance(element, dict):
-                            if element["type"] == "text":
+                            if element["type"] == "thinking":
+                                thinking_block = BedrockConverseMessagesProcessor.translate_thinking_blocks_to_reasoning_content_blocks(
+                                    thinking_blocks=[
+                                        cast(ChatCompletionThinkingBlock, element)
+                                    ]
+                                )
+                                assistants_parts.extend(thinking_block)
+                            elif element["type"] == "text":
                                assistants_part = BedrockContentBlock(text=element["text"])
                                assistants_parts.append(assistants_part)
                            elif element["type"] == "image_url":
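Both Bedrock paths above route thinking blocks through translate_thinking_blocks_to_reasoning_content_blocks before handling text blocks. A rough sketch of what such a translation could look like; the Converse block shape used here (reasoningContent / reasoningText) is an assumption about the Bedrock schema, not a copy of litellm's helper:

```python
from typing import List

def thinking_blocks_to_bedrock_reasoning(thinking_blocks: List[dict]) -> List[dict]:
    # Convert OpenAI-style thinking blocks into assumed Bedrock Converse reasoning blocks.
    bedrock_blocks: List[dict] = []
    for block in thinking_blocks:
        bedrock_blocks.append(
            {
                "reasoningContent": {
                    "reasoningText": {
                        "text": block.get("thinking", ""),
                        "signature": block.get("signature", ""),
                    }
                }
            }
        )
    return bedrock_blocks
```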
@@ -361,6 +361,7 @@ class ChatCompletionThinkingBlock(TypedDict, total=False):
    type: Required[Literal["thinking"]]
    thinking: str
    signature_delta: str
+    cache_control: Optional[Union[dict, ChatCompletionCachedContent]]


class OpenAIChatCompletionTextObject(TypedDict):
@@ -449,7 +450,11 @@ class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False):

class OpenAIChatCompletionAssistantMessage(TypedDict, total=False):
    role: Required[Literal["assistant"]]
-    content: Optional[Union[str, Iterable[ChatCompletionTextObject]]]
+    content: Optional[
+        Union[
+            str, Iterable[Union[ChatCompletionTextObject, ChatCompletionThinkingBlock]]
+        ]
+    ]
    name: Optional[str]
    tool_calls: Optional[List[ChatCompletionAssistantToolCall]]
    function_call: Optional[ChatCompletionToolCallFunctionChunk]
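The type changes above let a thinking block carry cache_control and let assistant message content mix text and thinking blocks. A minimal sketch using the updated TypedDicts; the import path is taken from the earlier hunk, and the ephemeral cache_control dict follows Anthropic's prompt-caching format:

```python
from litellm.types.llms.openai import ChatCompletionThinkingBlock

thinking_block: ChatCompletionThinkingBlock = {
    "type": "thinking",
    "thinking": "2 + 2 = 4",
    "cache_control": {"type": "ephemeral"},  # newly allowed on thinking blocks
}

# Assistant message content may now be an iterable mixing text objects and thinking blocks.
assistant_message = {
    "role": "assistant",
    "content": [
        thinking_block,
        {"type": "text", "text": "The answer is 4."},
    ],
}
```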
@@ -1048,6 +1048,7 @@ def client(original_function): # noqa: PLR0915
            )

            if caching_handler_response.cached_result is not None:
+                verbose_logger.debug("Cache hit!")
                return caching_handler_response.cached_result

            # CHECK MAX TOKENS
2 file diffs suppressed because one or more lines are too long
@@ -855,3 +855,51 @@ def test_convert_to_model_response_object_with_empty_str():
    resp: ModelResponse = convert_to_model_response_object(**args)
    assert resp is not None
    assert resp.choices[0].message.content is not None
+
+
+def test_convert_to_model_response_object_with_thinking_content():
+    """Test that convert_to_model_response_object handles thinking content correctly."""
+
+    args = {
+        "response_object": {
+            "id": "chatcmpl-8cc87354-70f3-4a14-b71b-332e965d98d2",
+            "created": 1741057687,
+            "model": "claude-3-7-sonnet-20250219",
+            "object": "chat.completion",
+            "system_fingerprint": None,
+            "choices": [
+                {
+                    "finish_reason": "stop",
+                    "index": 0,
+                    "message": {
+                        "content": "# LiteLLM\n\nLiteLLM is an open-source library that provides a unified interface for working with various Large Language Models (LLMs). It acts as an abstraction layer that lets developers interact with multiple LLM providers through a single, consistent API.\n\n## Key features:\n\n- **Universal API**: Standardizes interactions with models from OpenAI, Anthropic, Cohere, Azure, and many other providers\n- **Simple switching**: Easily swap between different LLM providers without changing your code\n- **Routing capabilities**: Manage load balancing, fallbacks, and cost optimization\n- **Prompt templates**: Handle different model-specific prompt formats automatically\n- **Logging and observability**: Track usage, performance, and costs across providers\n\nLiteLLM is particularly useful for teams who want flexibility in their LLM infrastructure without creating custom integration code for each provider.",
+                        "role": "assistant",
+                        "tool_calls": None,
+                        "function_call": None,
+                        "reasoning_content": "The person is asking about \"litellm\" and included what appears to be a UUID or some form of identifier at the end of their message (fffffe14-7991-43d0-acd8-d3e606db31a8).\n\nLiteLLM is an open-source library/project that provides a unified interface for working with various Large Language Models (LLMs). It's essentially a lightweight package that standardizes the way developers can work with different LLM APIs like OpenAI, Anthropic, Cohere, etc. through a consistent interface.\n\nSome key features and aspects of LiteLLM:\n\n1. Unified API for multiple LLM providers (OpenAI, Anthropic, Azure, etc.)\n2. Standardized input/output formats\n3. Handles routing, fallbacks, and load balancing\n4. Provides logging and observability\n5. Can help with cost tracking across different providers\n6. Makes it easier to switch between different LLM providers\n\nThe UUID-like string they included doesn't seem directly related to the question, unless it's some form of identifier they're including for tracking purposes.",
+                        "thinking_blocks": [
+                            {
+                                "type": "thinking",
+                                "thinking": "The person is asking about \"litellm\" and included what appears to be a UUID or some form of identifier at the end of their message (fffffe14-7991-43d0-acd8-d3e606db31a8).\n\nLiteLLM is an open-source library/project that provides a unified interface for working with various Large Language Models (LLMs). It's essentially a lightweight package that standardizes the way developers can work with different LLM APIs like OpenAI, Anthropic, Cohere, etc. through a consistent interface.\n\nSome key features and aspects of LiteLLM:\n\n1. Unified API for multiple LLM providers (OpenAI, Anthropic, Azure, etc.)\n2. Standardized input/output formats\n3. Handles routing, fallbacks, and load balancing\n4. Provides logging and observability\n5. Can help with cost tracking across different providers\n6. Makes it easier to switch between different LLM providers\n\nThe UUID-like string they included doesn't seem directly related to the question, unless it's some form of identifier they're including for tracking purposes.",
+                                "signature": "ErUBCkYIARgCIkCf+r0qMSOMYkjlFERM00IxsY9I/m19dQGEF/Zv1E0AtvdZjKGnr+nr5vXUldmb/sUCgrQRH4YUyV0X3MoMrsNnEgxDqhUFcUTg1vM0CroaDEY1wKJ0Ca0EZ6S1jCIwF8ATum3xiF/mRSIIjoD6Virh0hFcOfH3Sz6Chtev9WUwwYMAVP4/hyzbrUDnsUlmKh0CfTayaXm6o63/6Kelr6pzLbErjQx2xZRnRjCypw==",
+                            }
+                        ],
+                    },
+                }
+            ],
+            "usage": {
+                "completion_tokens": 460,
+                "prompt_tokens": 65,
+                "total_tokens": 525,
+                "completion_tokens_details": None,
+                "prompt_tokens_details": {"audio_tokens": None, "cached_tokens": 0},
+                "cache_creation_input_tokens": 0,
+                "cache_read_input_tokens": 0,
+            },
+        },
+        "model_response_object": ModelResponse(),
+    }
+
+    resp: ModelResponse = convert_to_model_response_object(**args)
+    assert resp is not None
+    assert resp.choices[0].message.reasoning_content is not None
@@ -125,28 +125,6 @@ def test_anthropic_pt_formatting():
    assert anthropic_pt(messages) == expected_prompt


-def test_anthropic_messages_pt():
-    # Test case: No messages (filtered system messages only)
-    litellm.modify_params = True
-    messages = []
-    expected_messages = [{"role": "user", "content": [{"type": "text", "text": "."}]}]
-    assert (
-        anthropic_messages_pt(
-            messages, model="claude-3-sonnet-20240229", llm_provider="anthropic"
-        )
-        == expected_messages
-    )
-
-    # Test case: No messages (filtered system messages only) when modify_params is False should raise error
-    litellm.modify_params = False
-    messages = []
-    with pytest.raises(Exception) as err:
-        anthropic_messages_pt(
-            messages, model="claude-3-sonnet-20240229", llm_provider="anthropic"
-        )
-    assert "Invalid first message" in str(err.value)
-
-
def test_anthropic_messages_nested_pt():
    from litellm.types.llms.anthropic import (
        AnthopicMessagesAssistantMessageParam,
@@ -2561,3 +2561,30 @@ def test_redis_caching_multiple_namespaces():

    # request 4 without a namespace should not be cached under the same key as request 3
    assert response_4.id != response_3.id
+
+
+def test_caching_with_reasoning_content():
+    """
+    Test that reasoning content is cached
+    """
+
+    import uuid
+
+    messages = [{"role": "user", "content": f"what is litellm? {uuid.uuid4()}"}]
+    litellm.cache = Cache()
+
+    response_1 = completion(
+        model="anthropic/claude-3-7-sonnet-latest",
+        messages=messages,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+    )
+
+    response_2 = completion(
+        model="anthropic/claude-3-7-sonnet-latest",
+        messages=messages,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+    )
+
+    print(f"response 2: {response_2.model_dump_json(indent=4)}")
+    assert response_2._hidden_params["cache_hit"] == True
+    assert response_2.choices[0].message.reasoning_content is not None
@@ -257,6 +257,8 @@ def test_aaparallel_function_call_with_anthropic_thinking(model):
            thinking={"type": "enabled", "budget_tokens": 1024},
        ) # get a new response from the model where it can see the function response
        print("second response\n", second_response)
+
+        ## THIRD RESPONSE
    except litellm.InternalServerError as e:
        print(e)
    except litellm.RateLimitError as e: