[Feat] - Display thinking tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) (#9029)

* if merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* stash changes

* working merge_reasoning_content_in_choices with bedrock

* fix litellm_params accessor

* fix streaming handler

* merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* test_bedrock_stream_thinking_content_openwebui

* merge_reasoning_content_in_choices

* fix for _optional_combine_thinking_block_in_choices

* linting error fix
Ishaan Jaff 2025-03-06 18:32:58 -08:00 committed by GitHub
parent d5c20188d7
commit bfbbac38fc
9 changed files with 358 additions and 29 deletions

View file

@@ -277,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
 custom_prometheus_metadata_labels: List[str] = []
 #### REQUEST PRIORITIZATION ####
 priority_reservation: Optional[Dict[str, float]] = None
 force_ipv4: bool = (
     False  # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
 )

View file

@@ -57,6 +57,7 @@ def get_litellm_params(
     prompt_variables: Optional[dict] = None,
     async_call: Optional[bool] = None,
     ssl_verify: Optional[bool] = None,
+    merge_reasoning_content_in_choices: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     litellm_params = {
@@ -97,5 +98,6 @@ def get_litellm_params(
         "prompt_variables": prompt_variables,
         "async_call": async_call,
         "ssl_verify": ssl_verify,
+        "merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
     }
     return litellm_params

View file

@@ -15,6 +15,7 @@ from litellm import verbose_logger
 from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
 from litellm.litellm_core_utils.thread_pool_executor import executor
 from litellm.types.llms.openai import ChatCompletionChunk
+from litellm.types.router import GenericLiteLLMParams
 from litellm.types.utils import Delta
 from litellm.types.utils import GenericStreamingChunk as GChunk
 from litellm.types.utils import (
@@ -70,6 +71,17 @@ class CustomStreamWrapper:
         self.completion_stream = completion_stream
         self.sent_first_chunk = False
         self.sent_last_chunk = False
+        litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
+            **self.logging_obj.model_call_details.get("litellm_params", {})
+        )
+        self.merge_reasoning_content_in_choices: bool = (
+            litellm_params.merge_reasoning_content_in_choices or False
+        )
+        self.sent_first_thinking_block = False
+        self.sent_last_thinking_block = False
+        self.thinking_content = ""
+
         self.system_fingerprint: Optional[str] = None
         self.received_finish_reason: Optional[str] = None
         self.intermittent_finish_reason: Optional[str] = (
@@ -87,12 +99,7 @@ class CustomStreamWrapper:
         self.holding_chunk = ""
         self.complete_response = ""
         self.response_uptil_now = ""
-        _model_info = (
-            self.logging_obj.model_call_details.get("litellm_params", {}).get(
-                "model_info", {}
-            )
-            or {}
-        )
+        _model_info: Dict = litellm_params.model_info or {}

         _api_base = get_api_base(
             model=model or "",
@@ -873,6 +880,10 @@ class CustomStreamWrapper:
             _index: Optional[int] = completion_obj.get("index")
             if _index is not None:
                 model_response.choices[0].index = _index
+
+            self._optional_combine_thinking_block_in_choices(
+                model_response=model_response
+            )
             print_verbose(f"returning model_response: {model_response}")
             return model_response
         else:
@@ -929,6 +940,48 @@ class CustomStreamWrapper:
             self.chunks.append(model_response)
             return

+    def _optional_combine_thinking_block_in_choices(
+        self, model_response: ModelResponseStream
+    ) -> None:
+        """
+        UIs like OpenWebUI expect a single chunk whose content carries <think>...</think> tags.
+
+        Updates the model_response object in place, wrapping reasoning_content in <think>...</think> tags inside content.
+
+        Enabled when `merge_reasoning_content_in_choices=True` is passed in the request params.
+        """
+        if self.merge_reasoning_content_in_choices is True:
+            reasoning_content = getattr(
+                model_response.choices[0].delta, "reasoning_content", None
+            )
+            if reasoning_content:
+                if self.sent_first_thinking_block is False:
+                    model_response.choices[0].delta.content += (
+                        "<think>" + reasoning_content
+                    )
+                    self.sent_first_thinking_block = True
+                elif (
+                    self.sent_first_thinking_block is True
+                    and hasattr(model_response.choices[0].delta, "reasoning_content")
+                    and model_response.choices[0].delta.reasoning_content
+                ):
+                    model_response.choices[0].delta.content = reasoning_content
+            elif (
+                self.sent_first_thinking_block is True
+                and not self.sent_last_thinking_block
+                and model_response.choices[0].delta.content
+            ):
+                model_response.choices[0].delta.content = (
+                    "</think>" + model_response.choices[0].delta.content
+                )
+                self.sent_last_thinking_block = True
+
+            if hasattr(model_response.choices[0].delta, "reasoning_content"):
+                del model_response.choices[0].delta.reasoning_content
+        return
+
     def chunk_creator(self, chunk: Any):  # type: ignore # noqa: PLR0915
         model_response = self.model_response_creator()
         response_obj: Dict[str, Any] = {}
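The merging logic above is easiest to read as a small state machine over (reasoning_content, content) deltas. The sketch below is not part of the commit; `merge_thinking_deltas` is a hypothetical helper that restates the same rules, slightly simplified, on plain tuples:

```python
# Illustrative only: a simplified, standalone restatement of the merging rules above.
# `merge_thinking_deltas` is a hypothetical helper, not part of litellm.
from typing import List, Optional, Tuple


def merge_thinking_deltas(deltas: List[Tuple[Optional[str], Optional[str]]]) -> str:
    """Merge (reasoning_content, content) stream deltas into one string,
    wrapping the reasoning portion in <think>...</think> tags."""
    sent_first_thinking_block = False
    sent_last_thinking_block = False
    merged = ""
    for reasoning_content, content in deltas:
        if reasoning_content:
            if not sent_first_thinking_block:
                # first reasoning delta opens the thinking block
                merged += "<think>" + reasoning_content
                sent_first_thinking_block = True
            else:
                merged += reasoning_content
        elif sent_first_thinking_block and not sent_last_thinking_block and content:
            # first normal-content delta after reasoning closes the thinking block
            merged += "</think>" + content
            sent_last_thinking_block = True
        elif content:
            merged += content
    return merged


print(
    merge_thinking_deltas(
        [
            ("Let me think", ""),
            (" step by step", ""),
            (None, "The answer is 42"),
            (None, "."),
        ]
    )
)
# -> "<think>Let me think step by step</think>The answer is 42."
```

The key design choice mirrored here is that the closing `</think>` tag is emitted lazily, on the first chunk that carries normal content after reasoning has started, so no extra chunk needs to be injected into the stream.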

View file

@@ -1159,6 +1159,9 @@ def completion(  # type: ignore # noqa: PLR0915
         prompt_id=prompt_id,
         prompt_variables=prompt_variables,
         ssl_verify=ssl_verify,
+        merge_reasoning_content_in_choices=kwargs.get(
+            "merge_reasoning_content_in_choices", None
+        ),
     )
     logging.update_environment_variables(
         model=model,
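For reference, a minimal SDK-level call that exercises the new kwarg end to end might look like the sketch below. It mirrors the async test added later in this commit and assumes valid AWS Bedrock credentials with access to Claude 3.7 Sonnet:

```python
# Sketch: pass merge_reasoning_content_in_choices as a request param (assumes
# AWS Bedrock credentials are configured in the environment).
import litellm

response = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
    max_tokens=1080,
    thinking={"type": "enabled", "budget_tokens": 1024},
    merge_reasoning_content_in_choices=True,
)

content = ""
for chunk in response:
    # Reasoning tokens arrive inside delta.content, wrapped in <think>...</think>.
    content += chunk.choices[0].delta.content or ""

print(content)
```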

View file

@@ -1,27 +1,13 @@
 model_list:
-  - model_name: fake-openai-endpoint
+  - model_name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
     litellm_params:
-      model: openai/my-fake-model
-      api_key: my-fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: claude-special-alias
-    litellm_params:
-      model: anthropic/claude-3-haiku-20240307
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: claude-3-5-sonnet-20241022
-    litellm_params:
-      model: anthropic/claude-3-5-sonnet-20241022
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: claude-3-7-sonnet-20250219
-    litellm_params:
-      model: anthropic/claude-3-7-sonnet-20250219
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: anthropic/*
-    litellm_params:
-      model: anthropic/*
-      api_key: os.environ/ANTHROPIC_API_KEY
+      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+      thinking: {"type": "enabled", "budget_tokens": 1024}
+      max_tokens: 1080
+      merge_reasoning_content_in_choices: true

 general_settings:
   store_model_in_db: true
   store_prompts_in_spend_logs: true
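With this config, a client that talks to the proxy through the OpenAI SDK sees the thinking tokens inline in `delta.content`. A rough sketch, assuming the proxy is running locally on port 4000 and `sk-1234` is a placeholder key:

```python
# Sketch of how a UI-style client consumes this model through the LiteLLM proxy.
# base_url, api_key and model name below are placeholders matching the config above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

stream = client.chat.completions.create(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
)

for chunk in stream:
    # With merge_reasoning_content_in_choices: true, thinking tokens show up in
    # delta.content wrapped in <think>...</think>, which OpenWebUI renders as a
    # collapsible thinking section.
    print(chunk.choices[0].delta.content or "", end="")
```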

View file

@@ -192,6 +192,8 @@ class GenericLiteLLMParams(BaseModel):
     budget_duration: Optional[str] = None
     use_in_pass_through: Optional[bool] = False
     model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+    merge_reasoning_content_in_choices: Optional[bool] = False
+    model_info: Optional[Dict] = None

     def __init__(
         self,
@@ -231,6 +233,9 @@ class GenericLiteLLMParams(BaseModel):
         budget_duration: Optional[str] = None,
         # Pass through params
         use_in_pass_through: Optional[bool] = False,
+        # This will merge the reasoning content in the choices
+        merge_reasoning_content_in_choices: Optional[bool] = False,
+        model_info: Optional[Dict] = None,
         **params,
     ):
         args = locals()
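Since `CustomStreamWrapper` re-parses the raw `litellm_params` dict through this model (see the streaming handler change above), the two new fields get typed defaults instead of landing in the `extra="allow"` bucket. A quick sketch of the expected parsing behavior; `my-model-id` is a placeholder:

```python
# Sketch (not in the commit): how the streaming wrapper reads the new fields back.
from litellm.types.router import GenericLiteLLMParams

params = GenericLiteLLMParams(
    merge_reasoning_content_in_choices=True,
    model_info={"id": "my-model-id"},
)
assert params.merge_reasoning_content_in_choices is True
assert params.model_info == {"id": "my-model-id"}

# When the flag is not supplied, it defaults to False rather than being missing.
assert GenericLiteLLMParams().merge_reasoning_content_in_choices is False
```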

View file

@@ -21,6 +21,8 @@ from openai.types.moderation_create_response import Moderation, ModerationCreate
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override

+import litellm
+
 from ..litellm_core_utils.core_helpers import map_finish_reason
 from .guardrails import GuardrailEventHooks
 from .llms.openai import (
@@ -1803,6 +1805,7 @@ all_litellm_params = [
     "max_budget",
     "budget_duration",
     "use_in_pass_through",
+    "merge_reasoning_content_in_choices",
 ] + list(StandardCallbackDynamicParams.__annotations__.keys())

View file

@@ -46,3 +46,213 @@ def test_is_chunk_non_empty(initialized_custom_stream_wrapper: CustomStreamWrapper
         model_response=ModelResponseStream(**chunk),
         response_obj=MagicMock(),
     )
+
+
+def test_optional_combine_thinking_block_in_choices(
+    initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+    """Test that reasoning_content is properly combined with content using <think> tags"""
+    # Setup the wrapper to use the merge feature
+    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+
+    # First chunk with reasoning_content - should add <think> tag
+    first_chunk = {
+        "id": "chunk1",
+        "object": "chat.completion.chunk",
+        "created": 1741037890,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {
+                    "content": "",
+                    "reasoning_content": "Let me think about this",
+                },
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Middle chunk with more reasoning_content
+    middle_chunk = {
+        "id": "chunk2",
+        "object": "chat.completion.chunk",
+        "created": 1741037891,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "", "reasoning_content": " step by step"},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Final chunk with actual content - should add </think> tag
+    final_chunk = {
+        "id": "chunk3",
+        "object": "chat.completion.chunk",
+        "created": 1741037892,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "The answer is 42", "reasoning_content": None},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Process first chunk
+    first_response = ModelResponseStream(**first_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        first_response
+    )
+    print("first_response", json.dumps(first_response, indent=4, default=str))
+    assert first_response.choices[0].delta.content == "<think>Let me think about this"
+    # assert the response does not have attribute reasoning_content
+    assert not hasattr(first_response.choices[0].delta, "reasoning_content")
+    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+
+    # Process middle chunk
+    middle_response = ModelResponseStream(**middle_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        middle_response
+    )
+    print("middle_response", json.dumps(middle_response, indent=4, default=str))
+    assert middle_response.choices[0].delta.content == " step by step"
+    assert not hasattr(middle_response.choices[0].delta, "reasoning_content")
+
+    # Process final chunk
+    final_response = ModelResponseStream(**final_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        final_response
+    )
+    print("final_response", json.dumps(final_response, indent=4, default=str))
+    assert final_response.choices[0].delta.content == "</think>The answer is 42"
+    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
+    assert not hasattr(final_response.choices[0].delta, "reasoning_content")
+
+
+def test_multi_chunk_reasoning_and_content(
+    initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+    """Test handling of multiple reasoning chunks followed by multiple content chunks"""
+    # Setup the wrapper to use the merge feature
+    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+    initialized_custom_stream_wrapper.sent_first_thinking_block = False
+    initialized_custom_stream_wrapper.sent_last_thinking_block = False
+
+    # Create test chunks
+    chunks = [
+        # Chunk 1: First reasoning chunk
+        {
+            "id": "chunk1",
+            "object": "chat.completion.chunk",
+            "created": 1741037890,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                        "reasoning_content": "To solve this problem",
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 2: Second reasoning chunk
+        {
+            "id": "chunk2",
+            "object": "chat.completion.chunk",
+            "created": 1741037891,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                        "reasoning_content": ", I need to calculate 6 * 7",
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 3: Third reasoning chunk
+        {
+            "id": "chunk3",
+            "object": "chat.completion.chunk",
+            "created": 1741037892,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": "", "reasoning_content": " which equals 42"},
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 4: First content chunk (transition from reasoning to content)
+        {
+            "id": "chunk4",
+            "object": "chat.completion.chunk",
+            "created": 1741037893,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "The answer to your question",
+                        "reasoning_content": None,
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 5: Second content chunk
+        {
+            "id": "chunk5",
+            "object": "chat.completion.chunk",
+            "created": 1741037894,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": " is 42.", "reasoning_content": None},
+                    "finish_reason": None,
+                }
+            ],
+        },
+    ]
+
+    # Expected content after processing each chunk
+    expected_contents = [
+        "<think>To solve this problem",
+        ", I need to calculate 6 * 7",
+        " which equals 42",
+        "</think>The answer to your question",
+        " is 42.",
+    ]
+
+    # Process each chunk and verify results
+    for i, (chunk, expected_content) in enumerate(zip(chunks, expected_contents)):
+        response = ModelResponseStream(**chunk)
+        initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+            response
+        )
+
+        # Check content
+        assert (
+            response.choices[0].delta.content == expected_content
+        ), f"Chunk {i+1}: content mismatch"
+
+        # Check reasoning_content was removed
+        assert not hasattr(
+            response.choices[0].delta, "reasoning_content"
+        ), f"Chunk {i+1}: reasoning_content should be removed"
+
+    # Verify final state
+    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True

View file

@@ -2841,3 +2841,72 @@ async def test_bedrock_thinking_in_assistant_message(sync_mode):
         "Alright, let's get started with resolving this issue about implementing"
         in json_data
     )
+
+
+@pytest.mark.asyncio
+async def test_bedrock_stream_thinking_content_openwebui():
+    """
+    When merge_reasoning_content_in_choices=True
+
+    The content should be collected as
+
+    ```
+    <think>
+    I am a helpful assistant, the user wants to know who I am
+    </think>
+    Hi I am Anthropic, I am a helpful assistant
+    ```
+    """
+    response = await litellm.acompletion(
+        model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        messages=[{"role": "user", "content": "Hello who is this?"}],
+        stream=True,
+        max_tokens=1080,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+        merge_reasoning_content_in_choices=True,
+    )
+
+    content = ""
+    async for chunk in response:
+        content += chunk.choices[0].delta.content or ""
+        # OpenWebUI expects the reasoning_content to be removed, otherwise this will appear as duplicate thinking blocks
+        assert getattr(chunk.choices[0].delta, "reasoning_content", None) is None
+        print(chunk)
+
+    print("collected content", content)
+
+    # Assert that the content follows the expected format with exactly one thinking section
+    think_open_pos = content.find("<think>")
+    think_close_pos = content.find("</think>")
+
+    # Assert there's exactly one opening and closing tag
+    assert think_open_pos >= 0, "Opening <think> tag not found"
+    assert think_close_pos > 0, "Closing </think> tag not found"
+    assert (
+        content.count("<think>") == 1
+    ), "There should be exactly one opening <think> tag"
+    assert (
+        content.count("</think>") == 1
+    ), "There should be exactly one closing </think> tag"
+
+    # Assert the opening tag comes before the closing tag
+    assert (
+        think_open_pos < think_close_pos
+    ), "Opening tag should come before closing tag"
+
+    # Assert there's content between the tags
+    thinking_content = content[think_open_pos + 7 : think_close_pos]
+    assert (
+        len(thinking_content.strip()) > 0
+    ), "There should be content between thinking tags"
+
+    # Assert there's content after the closing tag
+    assert (
+        len(content) > think_close_pos + 8
+    ), "There should be content after the thinking tags"
+    response_content = content[think_close_pos + 8 :].strip()
+    assert (
+        len(response_content) > 0
+    ), "There should be non-empty content after thinking tags"