[Feat] - Display thinking tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) (#9029)

* if merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* stash changes

* working merge_reasoning_content_in_choices with bedrock

* fix litellm_params accessor

* fix streaming handler

* merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* test_bedrock_stream_thinking_content_openwebui

* merge_reasoning_content_in_choices

* fix for _optional_combine_thinking_block_in_choices

* linting error fix
Ishaan Jaff 2025-03-06 18:32:58 -08:00 committed by GitHub
parent d5c20188d7
commit bfbbac38fc
9 changed files with 358 additions and 29 deletions

View file

@@ -277,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
 custom_prometheus_metadata_labels: List[str] = []
 #### REQUEST PRIORITIZATION ####
 priority_reservation: Optional[Dict[str, float]] = None
 force_ipv4: bool = (
     False  # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
 )

View file

@@ -57,6 +57,7 @@ def get_litellm_params(
     prompt_variables: Optional[dict] = None,
     async_call: Optional[bool] = None,
     ssl_verify: Optional[bool] = None,
+    merge_reasoning_content_in_choices: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     litellm_params = {
@@ -97,5 +98,6 @@ def get_litellm_params(
         "prompt_variables": prompt_variables,
         "async_call": async_call,
         "ssl_verify": ssl_verify,
+        "merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
     }
     return litellm_params

View file

@@ -15,6 +15,7 @@ from litellm import verbose_logger
 from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
 from litellm.litellm_core_utils.thread_pool_executor import executor
 from litellm.types.llms.openai import ChatCompletionChunk
+from litellm.types.router import GenericLiteLLMParams
 from litellm.types.utils import Delta
 from litellm.types.utils import GenericStreamingChunk as GChunk
 from litellm.types.utils import (
@@ -70,6 +71,17 @@ class CustomStreamWrapper:
         self.completion_stream = completion_stream
         self.sent_first_chunk = False
         self.sent_last_chunk = False
+        litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
+            **self.logging_obj.model_call_details.get("litellm_params", {})
+        )
+        self.merge_reasoning_content_in_choices: bool = (
+            litellm_params.merge_reasoning_content_in_choices or False
+        )
+        self.sent_first_thinking_block = False
+        self.sent_last_thinking_block = False
+        self.thinking_content = ""
+
         self.system_fingerprint: Optional[str] = None
         self.received_finish_reason: Optional[str] = None
         self.intermittent_finish_reason: Optional[str] = (
@@ -87,12 +99,7 @@ class CustomStreamWrapper:
         self.holding_chunk = ""
         self.complete_response = ""
         self.response_uptil_now = ""
-        _model_info = (
-            self.logging_obj.model_call_details.get("litellm_params", {}).get(
-                "model_info", {}
-            )
-            or {}
-        )
+        _model_info: Dict = litellm_params.model_info or {}

         _api_base = get_api_base(
             model=model or "",
@@ -873,6 +880,10 @@ class CustomStreamWrapper:
             _index: Optional[int] = completion_obj.get("index")
             if _index is not None:
                 model_response.choices[0].index = _index
+
+            self._optional_combine_thinking_block_in_choices(
+                model_response=model_response
+            )
             print_verbose(f"returning model_response: {model_response}")
             return model_response
         else:
@@ -929,6 +940,48 @@ class CustomStreamWrapper:
             self.chunks.append(model_response)
             return

+    def _optional_combine_thinking_block_in_choices(
+        self, model_response: ModelResponseStream
+    ) -> None:
+        """
+        UIs like OpenWebUI expect a single chunk whose content carries <think>...</think> tags.
+
+        Updates the model_response object in place, wrapping reasoning_content in <think>...</think> tags inside content.
+
+        Enabled when `merge_reasoning_content_in_choices=True` is passed in the request params.
+        """
+        if self.merge_reasoning_content_in_choices is True:
+            reasoning_content = getattr(
+                model_response.choices[0].delta, "reasoning_content", None
+            )
+            if reasoning_content:
+                if self.sent_first_thinking_block is False:
+                    model_response.choices[0].delta.content += (
+                        "<think>" + reasoning_content
+                    )
+                    self.sent_first_thinking_block = True
+                elif (
+                    self.sent_first_thinking_block is True
+                    and hasattr(model_response.choices[0].delta, "reasoning_content")
+                    and model_response.choices[0].delta.reasoning_content
+                ):
+                    model_response.choices[0].delta.content = reasoning_content
+            elif (
+                self.sent_first_thinking_block is True
+                and not self.sent_last_thinking_block
+                and model_response.choices[0].delta.content
+            ):
+                model_response.choices[0].delta.content = (
+                    "</think>" + model_response.choices[0].delta.content
+                )
+                self.sent_last_thinking_block = True
+
+            if hasattr(model_response.choices[0].delta, "reasoning_content"):
+                del model_response.choices[0].delta.reasoning_content
+        return
+
     def chunk_creator(self, chunk: Any):  # type: ignore # noqa: PLR0915
         model_response = self.model_response_creator()
         response_obj: Dict[str, Any] = {}
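The merging logic above is easiest to read as a small state machine over (reasoning_content, content) deltas. The sketch below is not part of the commit; `merge_thinking_deltas` is a hypothetical helper that restates the same rules, slightly simplified, on plain tuples:

```python
# Illustrative only: a simplified, standalone restatement of the merging rules above.
# `merge_thinking_deltas` is a hypothetical helper, not part of litellm.
from typing import List, Optional, Tuple


def merge_thinking_deltas(deltas: List[Tuple[Optional[str], Optional[str]]]) -> str:
    """Merge (reasoning_content, content) stream deltas into one string,
    wrapping the reasoning portion in <think>...</think> tags."""
    sent_first_thinking_block = False
    sent_last_thinking_block = False
    merged = ""
    for reasoning_content, content in deltas:
        if reasoning_content:
            if not sent_first_thinking_block:
                # first reasoning delta opens the thinking block
                merged += "<think>" + reasoning_content
                sent_first_thinking_block = True
            else:
                merged += reasoning_content
        elif sent_first_thinking_block and not sent_last_thinking_block and content:
            # first normal-content delta after reasoning closes the thinking block
            merged += "</think>" + content
            sent_last_thinking_block = True
        elif content:
            merged += content
    return merged


print(
    merge_thinking_deltas(
        [
            ("Let me think", ""),
            (" step by step", ""),
            (None, "The answer is 42"),
            (None, "."),
        ]
    )
)
# -> "<think>Let me think step by step</think>The answer is 42."
```

The key design choice mirrored here is that the closing `</think>` tag is emitted lazily, on the first chunk that carries normal content after reasoning has started, so no extra chunk needs to be injected into the stream.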

View file

@@ -1159,6 +1159,9 @@ def completion(  # type: ignore # noqa: PLR0915
         prompt_id=prompt_id,
         prompt_variables=prompt_variables,
         ssl_verify=ssl_verify,
+        merge_reasoning_content_in_choices=kwargs.get(
+            "merge_reasoning_content_in_choices", None
+        ),
     )
     logging.update_environment_variables(
         model=model,
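For reference, a minimal SDK-level call that exercises the new kwarg end to end might look like the sketch below. It mirrors the async test added later in this commit and assumes valid AWS Bedrock credentials with access to Claude 3.7 Sonnet:

```python
# Sketch: pass merge_reasoning_content_in_choices as a request param (assumes
# AWS Bedrock credentials are configured in the environment).
import litellm

response = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
    max_tokens=1080,
    thinking={"type": "enabled", "budget_tokens": 1024},
    merge_reasoning_content_in_choices=True,
)

content = ""
for chunk in response:
    # Reasoning tokens arrive inside delta.content, wrapped in <think>...</think>.
    content += chunk.choices[0].delta.content or ""

print(content)
```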

View file

@@ -1,27 +1,13 @@
 model_list:
-  - model_name: fake-openai-endpoint
+  - model_name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
     litellm_params:
-      model: openai/my-fake-model
-      api_key: my-fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: claude-special-alias
-    litellm_params:
-      model: anthropic/claude-3-haiku-20240307
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: claude-3-5-sonnet-20241022
-    litellm_params:
-      model: anthropic/claude-3-5-sonnet-20241022
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: claude-3-7-sonnet-20250219
-    litellm_params:
-      model: anthropic/claude-3-7-sonnet-20250219
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: anthropic/*
-    litellm_params:
-      model: anthropic/*
-      api_key: os.environ/ANTHROPIC_API_KEY
+      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+      thinking: {"type": "enabled", "budget_tokens": 1024}
+      max_tokens: 1080
+      merge_reasoning_content_in_choices: true

 general_settings:
   store_model_in_db: true
   store_prompts_in_spend_logs: true
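With this config, a client that talks to the proxy through the OpenAI SDK sees the thinking tokens inline in `delta.content`. A rough sketch, assuming the proxy is running locally on port 4000 and `sk-1234` is a placeholder key:

```python
# Sketch of how a UI-style client consumes this model through the LiteLLM proxy.
# base_url, api_key and model name below are placeholders matching the config above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

stream = client.chat.completions.create(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
)

for chunk in stream:
    # With merge_reasoning_content_in_choices: true, thinking tokens show up in
    # delta.content wrapped in <think>...</think>, which OpenWebUI renders as a
    # collapsible thinking section.
    print(chunk.choices[0].delta.content or "", end="")
```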

View file

@@ -192,6 +192,8 @@ class GenericLiteLLMParams(BaseModel):
     budget_duration: Optional[str] = None
     use_in_pass_through: Optional[bool] = False
     model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+    merge_reasoning_content_in_choices: Optional[bool] = False
+    model_info: Optional[Dict] = None

     def __init__(
         self,
@@ -231,6 +233,9 @@ class GenericLiteLLMParams(BaseModel):
         budget_duration: Optional[str] = None,
         # Pass through params
         use_in_pass_through: Optional[bool] = False,
+        # This will merge the reasoning content in the choices
+        merge_reasoning_content_in_choices: Optional[bool] = False,
+        model_info: Optional[Dict] = None,
         **params,
     ):
         args = locals()
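Since `CustomStreamWrapper` re-parses the raw `litellm_params` dict through this model (see the streaming handler change above), the two new fields get typed defaults instead of landing in the `extra="allow"` bucket. A quick sketch of the expected parsing behavior; `my-model-id` is a placeholder:

```python
# Sketch (not in the commit): how the streaming wrapper reads the new fields back.
from litellm.types.router import GenericLiteLLMParams

params = GenericLiteLLMParams(
    merge_reasoning_content_in_choices=True,
    model_info={"id": "my-model-id"},
)
assert params.merge_reasoning_content_in_choices is True
assert params.model_info == {"id": "my-model-id"}

# When the flag is not supplied, it defaults to False rather than being missing.
assert GenericLiteLLMParams().merge_reasoning_content_in_choices is False
```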

View file

@@ -21,6 +21,8 @@ from openai.types.moderation_create_response import Moderation, ModerationCreate
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override

+import litellm
+
 from ..litellm_core_utils.core_helpers import map_finish_reason
 from .guardrails import GuardrailEventHooks
 from .llms.openai import (
@@ -1803,6 +1805,7 @@ all_litellm_params = [
     "max_budget",
     "budget_duration",
     "use_in_pass_through",
+    "merge_reasoning_content_in_choices",
 ] + list(StandardCallbackDynamicParams.__annotations__.keys())

View file

@@ -46,3 +46,213 @@ def test_is_chunk_non_empty(initialized_custom_stream_wrapper: CustomStreamWrapper
         model_response=ModelResponseStream(**chunk),
         response_obj=MagicMock(),
     )
+
+
+def test_optional_combine_thinking_block_in_choices(
+    initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+    """Test that reasoning_content is properly combined with content using <think> tags"""
+    # Setup the wrapper to use the merge feature
+    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+
+    # First chunk with reasoning_content - should add <think> tag
+    first_chunk = {
+        "id": "chunk1",
+        "object": "chat.completion.chunk",
+        "created": 1741037890,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {
+                    "content": "",
+                    "reasoning_content": "Let me think about this",
+                },
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Middle chunk with more reasoning_content
+    middle_chunk = {
+        "id": "chunk2",
+        "object": "chat.completion.chunk",
+        "created": 1741037891,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "", "reasoning_content": " step by step"},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Final chunk with actual content - should add </think> tag
+    final_chunk = {
+        "id": "chunk3",
+        "object": "chat.completion.chunk",
+        "created": 1741037892,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "The answer is 42", "reasoning_content": None},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Process first chunk
+    first_response = ModelResponseStream(**first_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        first_response
+    )
+    print("first_response", json.dumps(first_response, indent=4, default=str))
+    assert first_response.choices[0].delta.content == "<think>Let me think about this"
+    # assert the response does not have attribute reasoning_content
+    assert not hasattr(first_response.choices[0].delta, "reasoning_content")
+    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+
+    # Process middle chunk
+    middle_response = ModelResponseStream(**middle_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        middle_response
+    )
+    print("middle_response", json.dumps(middle_response, indent=4, default=str))
+    assert middle_response.choices[0].delta.content == " step by step"
+    assert not hasattr(middle_response.choices[0].delta, "reasoning_content")
+
+    # Process final chunk
+    final_response = ModelResponseStream(**final_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        final_response
+    )
+    print("final_response", json.dumps(final_response, indent=4, default=str))
+    assert final_response.choices[0].delta.content == "</think>The answer is 42"
+    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
+    assert not hasattr(final_response.choices[0].delta, "reasoning_content")
+
+
+def test_multi_chunk_reasoning_and_content(
+    initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+    """Test handling of multiple reasoning chunks followed by multiple content chunks"""
+    # Setup the wrapper to use the merge feature
+    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+    initialized_custom_stream_wrapper.sent_first_thinking_block = False
+    initialized_custom_stream_wrapper.sent_last_thinking_block = False
+
+    # Create test chunks
+    chunks = [
+        # Chunk 1: First reasoning chunk
+        {
+            "id": "chunk1",
+            "object": "chat.completion.chunk",
+            "created": 1741037890,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                        "reasoning_content": "To solve this problem",
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 2: Second reasoning chunk
+        {
+            "id": "chunk2",
+            "object": "chat.completion.chunk",
+            "created": 1741037891,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                        "reasoning_content": ", I need to calculate 6 * 7",
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 3: Third reasoning chunk
+        {
+            "id": "chunk3",
+            "object": "chat.completion.chunk",
+            "created": 1741037892,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": "", "reasoning_content": " which equals 42"},
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 4: First content chunk (transition from reasoning to content)
+        {
+            "id": "chunk4",
+            "object": "chat.completion.chunk",
+            "created": 1741037893,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "The answer to your question",
+                        "reasoning_content": None,
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 5: Second content chunk
+        {
+            "id": "chunk5",
+            "object": "chat.completion.chunk",
+            "created": 1741037894,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": " is 42.", "reasoning_content": None},
+                    "finish_reason": None,
+                }
+            ],
+        },
+    ]
+
+    # Expected content after processing each chunk
+    expected_contents = [
+        "<think>To solve this problem",
+        ", I need to calculate 6 * 7",
+        " which equals 42",
+        "</think>The answer to your question",
+        " is 42.",
+    ]
+
+    # Process each chunk and verify results
+    for i, (chunk, expected_content) in enumerate(zip(chunks, expected_contents)):
+        response = ModelResponseStream(**chunk)
+        initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+            response
+        )
+
+        # Check content
+        assert (
+            response.choices[0].delta.content == expected_content
+        ), f"Chunk {i+1}: content mismatch"
+
+        # Check reasoning_content was removed
+        assert not hasattr(
+            response.choices[0].delta, "reasoning_content"
+        ), f"Chunk {i+1}: reasoning_content should be removed"
+
+    # Verify final state
+    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True

View file

@@ -2841,3 +2841,72 @@ async def test_bedrock_thinking_in_assistant_message(sync_mode):
         "Alright, let's get started with resolving this issue about implementing"
         in json_data
     )
+
+
+@pytest.mark.asyncio
+async def test_bedrock_stream_thinking_content_openwebui():
+    """
+    When merge_reasoning_content_in_choices=True
+
+    The content should be collected as
+
+    ```
+    <think>
+    I am a helpful assistant, the user wants to know who I am
+    </think>
+    Hi I am Anthropic, I am a helpful assistant
+    ```
+    """
+    response = await litellm.acompletion(
+        model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        messages=[{"role": "user", "content": "Hello who is this?"}],
+        stream=True,
+        max_tokens=1080,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+        merge_reasoning_content_in_choices=True,
+    )
+
+    content = ""
+    async for chunk in response:
+        content += chunk.choices[0].delta.content or ""
+        # OpenWebUI expects the reasoning_content to be removed, otherwise this will appear as duplicate thinking blocks
+        assert getattr(chunk.choices[0].delta, "reasoning_content", None) is None
+        print(chunk)
+
+    print("collected content", content)
+
+    # Assert that the content follows the expected format with exactly one thinking section
+    think_open_pos = content.find("<think>")
+    think_close_pos = content.find("</think>")
+
+    # Assert there's exactly one opening and closing tag
+    assert think_open_pos >= 0, "Opening <think> tag not found"
+    assert think_close_pos > 0, "Closing </think> tag not found"
+    assert (
+        content.count("<think>") == 1
+    ), "There should be exactly one opening <think> tag"
+    assert (
+        content.count("</think>") == 1
+    ), "There should be exactly one closing </think> tag"
+
+    # Assert the opening tag comes before the closing tag
+    assert (
+        think_open_pos < think_close_pos
+    ), "Opening tag should come before closing tag"
+
+    # Assert there's content between the tags
+    thinking_content = content[think_open_pos + 7 : think_close_pos]
+    assert (
+        len(thinking_content.strip()) > 0
+    ), "There should be content between thinking tags"
+
+    # Assert there's content after the closing tag
+    assert (
+        len(content) > think_close_pos + 8
+    ), "There should be content after the thinking tags"
+    response_content = content[think_close_pos + 8 :].strip()
+    assert (
+        len(response_content) > 0
+    ), "There should be non-empty content after thinking tags"