[Feat] - Display thinking tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) (#9029)

* if merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* stash changes

* working merge_reasoning_content_in_choices with bedrock

* fix litellm_params accessor

* fix streaming handler

* merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* test_bedrock_stream_thinking_content_openwebui

* merge_reasoning_content_in_choices

* fix for _optional_combine_thinking_block_in_choices

* linting error fix
Ishaan Jaff 2025-03-06 18:32:58 -08:00 committed by GitHub
parent 85d1427710
commit b02af305de
9 changed files with 358 additions and 29 deletions
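For reference, a minimal usage sketch of the new flag, mirroring the streaming test added in this commit. It assumes ANTHROPIC_API_KEY is set in the environment; the direct-Anthropic model name is illustrative (the test itself uses the Bedrock route):

```
import litellm

# Assumes ANTHROPIC_API_KEY is set; model name is illustrative.
response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
    max_tokens=1080,
    thinking={"type": "enabled", "budget_tokens": 1024},
    merge_reasoning_content_in_choices=True,
)

content = ""
for chunk in response:
    content += chunk.choices[0].delta.content or ""
# content now looks like "<think>...reasoning...</think>...final answer"
```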


@@ -277,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)


@@ -57,6 +57,7 @@ def get_litellm_params(
prompt_variables: Optional[dict] = None,
async_call: Optional[bool] = None,
ssl_verify: Optional[bool] = None,
merge_reasoning_content_in_choices: Optional[bool] = None,
**kwargs,
) -> dict:
litellm_params = {
@@ -97,5 +98,6 @@
"prompt_variables": prompt_variables,
"async_call": async_call,
"ssl_verify": ssl_verify,
"merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
}
return litellm_params


@@ -15,6 +15,7 @@ from litellm import verbose_logger
from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.types.llms.openai import ChatCompletionChunk
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import Delta
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import (
@@ -70,6 +71,17 @@ class CustomStreamWrapper:
self.completion_stream = completion_stream
self.sent_first_chunk = False
self.sent_last_chunk = False
litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
**self.logging_obj.model_call_details.get("litellm_params", {})
)
self.merge_reasoning_content_in_choices: bool = (
litellm_params.merge_reasoning_content_in_choices or False
)
self.sent_first_thinking_block = False
self.sent_last_thinking_block = False
self.thinking_content = ""
self.system_fingerprint: Optional[str] = None
self.received_finish_reason: Optional[str] = None
self.intermittent_finish_reason: Optional[str] = (
@@ -87,12 +99,7 @@
self.holding_chunk = ""
self.complete_response = ""
self.response_uptil_now = ""
_model_info = (
self.logging_obj.model_call_details.get("litellm_params", {}).get(
"model_info", {}
)
or {}
)
_model_info: Dict = litellm_params.model_info or {}
_api_base = get_api_base(
model=model or "",
@@ -873,6 +880,10 @@
_index: Optional[int] = completion_obj.get("index")
if _index is not None:
model_response.choices[0].index = _index
self._optional_combine_thinking_block_in_choices(
model_response=model_response
)
print_verbose(f"returning model_response: {model_response}")
return model_response
else:
@@ -929,6 +940,48 @@
self.chunks.append(model_response)
return
def _optional_combine_thinking_block_in_choices(
self, model_response: ModelResponseStream
) -> None:
"""
UIs like OpenWebUI expect the reasoning to arrive inside the chunk content, wrapped in <think>...</think> tags.
Updates the model_response object in place, merging reasoning_content into content between <think>...</think> tags.
Enabled when `merge_reasoning_content_in_choices=True` is passed in the request params.
"""
if self.merge_reasoning_content_in_choices is True:
reasoning_content = getattr(
model_response.choices[0].delta, "reasoning_content", None
)
if reasoning_content:
if self.sent_first_thinking_block is False:
model_response.choices[0].delta.content += (
"<think>" + reasoning_content
)
self.sent_first_thinking_block = True
elif (
self.sent_first_thinking_block is True
and hasattr(model_response.choices[0].delta, "reasoning_content")
and model_response.choices[0].delta.reasoning_content
):
model_response.choices[0].delta.content = reasoning_content
elif (
self.sent_first_thinking_block is True
and not self.sent_last_thinking_block
and model_response.choices[0].delta.content
):
model_response.choices[0].delta.content = (
"</think>" + model_response.choices[0].delta.content
)
self.sent_last_thinking_block = True
if hasattr(model_response.choices[0].delta, "reasoning_content"):
del model_response.choices[0].delta.reasoning_content
return
def chunk_creator(self, chunk: Any): # type: ignore # noqa: PLR0915
model_response = self.model_response_creator()
response_obj: Dict[str, Any] = {}
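For illustration, a self-contained sketch of the tag-merging state machine above, operating on plain dicts rather than the wrapper's ModelResponseStream deltas. It is not the CustomStreamWrapper implementation itself, and unlike the wrapper (which assumes the first chunk's content is an empty string) it guards against a None starting content:

```
def merge_thinking(deltas):
    sent_first, sent_last = False, False
    merged = []
    for delta in deltas:
        reasoning = delta.get("reasoning_content")
        content = delta.get("content") or ""
        if reasoning and not sent_first:
            # first reasoning chunk: open the thinking block
            content = content + "<think>" + reasoning
            sent_first = True
        elif reasoning and sent_first:
            # subsequent reasoning chunks stream through as-is
            content = reasoning
        elif sent_first and not sent_last and content:
            # first real content chunk: close the thinking block
            content = "</think>" + content
            sent_last = True
        merged.append(content)
    return merged


print(merge_thinking([
    {"content": "", "reasoning_content": "Let me think"},
    {"content": "", "reasoning_content": " step by step"},
    {"content": "The answer is 42", "reasoning_content": None},
]))
# ['<think>Let me think', ' step by step', '</think>The answer is 42']
```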


@@ -1159,6 +1159,9 @@ def completion( # type: ignore # noqa: PLR0915
prompt_id=prompt_id,
prompt_variables=prompt_variables,
ssl_verify=ssl_verify,
merge_reasoning_content_in_choices=kwargs.get(
"merge_reasoning_content_in_choices", None
),
)
logging.update_environment_variables(
model=model,


@@ -1,27 +1,13 @@
model_list:
- model_name: fake-openai-endpoint
- model_name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: claude-special-alias
litellm_params:
model: anthropic/claude-3-haiku-20240307
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: claude-3-5-sonnet-20241022
litellm_params:
model: anthropic/claude-3-5-sonnet-20241022
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: claude-3-7-sonnet-20250219
litellm_params:
model: anthropic/claude-3-7-sonnet-20250219
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: anthropic/*
litellm_params:
model: anthropic/*
api_key: os.environ/ANTHROPIC_API_KEY
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
thinking: {"type": "enabled", "budget_tokens": 1024}
max_tokens: 1080
merge_reasoning_content_in_choices: true
general_settings:
store_model_in_db: true
store_prompts_in_spend_logs: true
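A sketch of calling this proxy config through the OpenAI SDK; the base URL and API key below are placeholders for a locally running proxy:

```
from openai import OpenAI

# Placeholders: a LiteLLM proxy running locally with the config above.
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

stream = client.chat.completions.create(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
)
for chunk in stream:
    # reasoning arrives inline as <think>...</think>; no reasoning_content field
    print(chunk.choices[0].delta.content or "", end="")
```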


@@ -192,6 +192,8 @@ class GenericLiteLLMParams(BaseModel):
budget_duration: Optional[str] = None
use_in_pass_through: Optional[bool] = False
model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
merge_reasoning_content_in_choices: Optional[bool] = False
model_info: Optional[Dict] = None
def __init__(
self,
@@ -231,6 +233,9 @@ class GenericLiteLLMParams(BaseModel):
budget_duration: Optional[str] = None,
# Pass through params
use_in_pass_through: Optional[bool] = False,
# This will merge the reasoning content in the choices
merge_reasoning_content_in_choices: Optional[bool] = False,
model_info: Optional[Dict] = None,
**params,
):
args = locals()
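A quick sketch confirming the new field is accepted by GenericLiteLLMParams; since every constructor argument is optional, it can be set in isolation:

```
from litellm.types.router import GenericLiteLLMParams

# The new flag defaults to False and rides along like any other litellm param.
params = GenericLiteLLMParams(merge_reasoning_content_in_choices=True)
assert params.merge_reasoning_content_in_choices is True
assert GenericLiteLLMParams().merge_reasoning_content_in_choices is False
```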


@@ -21,6 +21,8 @@ from openai.types.moderation_create_response import Moderation, ModerationCreate
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from typing_extensions import Callable, Dict, Required, TypedDict, override
import litellm
from ..litellm_core_utils.core_helpers import map_finish_reason
from .guardrails import GuardrailEventHooks
from .llms.openai import (
@@ -1803,6 +1805,7 @@ all_litellm_params = [
"max_budget",
"budget_duration",
"use_in_pass_through",
"merge_reasoning_content_in_choices",
] + list(StandardCallbackDynamicParams.__annotations__.keys())


@@ -46,3 +46,213 @@ def test_is_chunk_non_empty(initialized_custom_stream_wrapper: CustomStreamWrapper
model_response=ModelResponseStream(**chunk),
response_obj=MagicMock(),
)
def test_optional_combine_thinking_block_in_choices(
initialized_custom_stream_wrapper: CustomStreamWrapper,
):
"""Test that reasoning_content is properly combined with content using <think> tags"""
# Setup the wrapper to use the merge feature
initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
# First chunk with reasoning_content - should add <think> tag
first_chunk = {
"id": "chunk1",
"object": "chat.completion.chunk",
"created": 1741037890,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "",
"reasoning_content": "Let me think about this",
},
"finish_reason": None,
}
],
}
# Middle chunk with more reasoning_content
middle_chunk = {
"id": "chunk2",
"object": "chat.completion.chunk",
"created": 1741037891,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": "", "reasoning_content": " step by step"},
"finish_reason": None,
}
],
}
# Final chunk with actual content - should add </think> tag
final_chunk = {
"id": "chunk3",
"object": "chat.completion.chunk",
"created": 1741037892,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": "The answer is 42", "reasoning_content": None},
"finish_reason": None,
}
],
}
# Process first chunk
first_response = ModelResponseStream(**first_chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
first_response
)
print("first_response", json.dumps(first_response, indent=4, default=str))
assert first_response.choices[0].delta.content == "<think>Let me think about this"
# assert the response does not have attribute reasoning_content
assert not hasattr(first_response.choices[0].delta, "reasoning_content")
assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
# Process middle chunk
middle_response = ModelResponseStream(**middle_chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
middle_response
)
print("middle_response", json.dumps(middle_response, indent=4, default=str))
assert middle_response.choices[0].delta.content == " step by step"
assert not hasattr(middle_response.choices[0].delta, "reasoning_content")
# Process final chunk
final_response = ModelResponseStream(**final_chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
final_response
)
print("final_response", json.dumps(final_response, indent=4, default=str))
assert final_response.choices[0].delta.content == "</think>The answer is 42"
assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
assert not hasattr(final_response.choices[0].delta, "reasoning_content")
def test_multi_chunk_reasoning_and_content(
initialized_custom_stream_wrapper: CustomStreamWrapper,
):
"""Test handling of multiple reasoning chunks followed by multiple content chunks"""
# Setup the wrapper to use the merge feature
initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
initialized_custom_stream_wrapper.sent_first_thinking_block = False
initialized_custom_stream_wrapper.sent_last_thinking_block = False
# Create test chunks
chunks = [
# Chunk 1: First reasoning chunk
{
"id": "chunk1",
"object": "chat.completion.chunk",
"created": 1741037890,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "",
"reasoning_content": "To solve this problem",
},
"finish_reason": None,
}
],
},
# Chunk 2: Second reasoning chunk
{
"id": "chunk2",
"object": "chat.completion.chunk",
"created": 1741037891,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "",
"reasoning_content": ", I need to calculate 6 * 7",
},
"finish_reason": None,
}
],
},
# Chunk 3: Third reasoning chunk
{
"id": "chunk3",
"object": "chat.completion.chunk",
"created": 1741037892,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": "", "reasoning_content": " which equals 42"},
"finish_reason": None,
}
],
},
# Chunk 4: First content chunk (transition from reasoning to content)
{
"id": "chunk4",
"object": "chat.completion.chunk",
"created": 1741037893,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "The answer to your question",
"reasoning_content": None,
},
"finish_reason": None,
}
],
},
# Chunk 5: Second content chunk
{
"id": "chunk5",
"object": "chat.completion.chunk",
"created": 1741037894,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": " is 42.", "reasoning_content": None},
"finish_reason": None,
}
],
},
]
# Expected content after processing each chunk
expected_contents = [
"<think>To solve this problem",
", I need to calculate 6 * 7",
" which equals 42",
"</think>The answer to your question",
" is 42.",
]
# Process each chunk and verify results
for i, (chunk, expected_content) in enumerate(zip(chunks, expected_contents)):
response = ModelResponseStream(**chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
response
)
# Check content
assert (
response.choices[0].delta.content == expected_content
), f"Chunk {i+1}: content mismatch"
# Check reasoning_content was removed
assert not hasattr(
response.choices[0].delta, "reasoning_content"
), f"Chunk {i+1}: reasoning_content should be removed"
# Verify final state
assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
assert initialized_custom_stream_wrapper.sent_last_thinking_block is True


@@ -2841,3 +2841,72 @@ async def test_bedrock_thinking_in_assistant_message(sync_mode):
"Alright, let's get started with resolving this issue about implementing"
in json_data
)
@pytest.mark.asyncio
async def test_bedrock_stream_thinking_content_openwebui():
"""
When merge_reasoning_content_in_choices=True, the content should be collected as
```
<think>
I am a helpful assistant, the user wants to know who I am
</think>
Hi I am Anthropic, I am a helpful assistant
```
"""
response = await litellm.acompletion(
model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
messages=[{"role": "user", "content": "Hello who is this?"}],
stream=True,
max_tokens=1080,
thinking={"type": "enabled", "budget_tokens": 1024},
merge_reasoning_content_in_choices=True,
)
content = ""
async for chunk in response:
content += chunk.choices[0].delta.content or ""
# OpenWebUI expects reasoning_content to be removed; otherwise it appears as duplicate thinking blocks
assert getattr(chunk.choices[0].delta, "reasoning_content", None) is None
print(chunk)
print("collected content", content)
# Assert that the content follows the expected format with exactly one thinking section
think_open_pos = content.find("<think>")
think_close_pos = content.find("</think>")
# Assert there's exactly one opening and closing tag
assert think_open_pos >= 0, "Opening <think> tag not found"
assert think_close_pos > 0, "Closing </think> tag not found"
assert (
content.count("<think>") == 1
), "There should be exactly one opening <think> tag"
assert (
content.count("</think>") == 1
), "There should be exactly one closing </think> tag"
# Assert the opening tag comes before the closing tag
assert (
think_open_pos < think_close_pos
), "Opening tag should come before closing tag"
# Assert there's content between the tags
thinking_content = content[think_open_pos + 7 : think_close_pos]
assert (
len(thinking_content.strip()) > 0
), "There should be content between thinking tags"
# Assert there's content after the closing tag
assert (
len(content) > think_close_pos + 8
), "There should be content after the thinking tags"
response_content = content[think_close_pos + 8 :].strip()
assert (
len(response_content) > 0
), "There should be non-empty content after thinking tags"