[Feat] - Display thinking tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) (#9029)

* if merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* stash changes

* working merge_reasoning_content_in_choices with bedrock

* fix litellm_params accessor

* fix streaming handler

* merge_reasoning_content_in_choices

* _optional_combine_thinking_block_in_choices

* test_bedrock_stream_thinking_content_openwebui

* merge_reasoning_content_in_choices

* fix for _optional_combine_thinking_block_in_choices

* linting error fix
Ishaan Jaff 2025-03-06 18:32:58 -08:00 committed by GitHub
parent 85d1427710
commit b02af305de
9 changed files with 358 additions and 29 deletions
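For reference, a minimal usage sketch of the new flag, mirroring the streaming test added in this commit. It assumes ANTHROPIC_API_KEY is set in the environment; the direct-Anthropic model name is illustrative (the test itself uses the Bedrock route):

```
import litellm

# Assumes ANTHROPIC_API_KEY is set; model name is illustrative.
response = litellm.completion(
    model="anthropic/claude-3-7-sonnet-20250219",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
    max_tokens=1080,
    thinking={"type": "enabled", "budget_tokens": 1024},
    merge_reasoning_content_in_choices=True,
)

content = ""
for chunk in response:
    content += chunk.choices[0].delta.content or ""
# content now looks like "<think>...reasoning...</think>...final answer"
```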


@@ -277,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)


@@ -57,6 +57,7 @@ def get_litellm_params(
prompt_variables: Optional[dict] = None,
async_call: Optional[bool] = None,
ssl_verify: Optional[bool] = None,
merge_reasoning_content_in_choices: Optional[bool] = None,
**kwargs,
) -> dict:
litellm_params = {
@@ -97,5 +98,6 @@
"prompt_variables": prompt_variables,
"async_call": async_call,
"ssl_verify": ssl_verify,
"merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
}
return litellm_params


@@ -15,6 +15,7 @@ from litellm import verbose_logger
from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.types.llms.openai import ChatCompletionChunk
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import Delta
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import (
@@ -70,6 +71,17 @@ class CustomStreamWrapper:
self.completion_stream = completion_stream
self.sent_first_chunk = False
self.sent_last_chunk = False
litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
**self.logging_obj.model_call_details.get("litellm_params", {})
)
self.merge_reasoning_content_in_choices: bool = (
litellm_params.merge_reasoning_content_in_choices or False
)
self.sent_first_thinking_block = False
self.sent_last_thinking_block = False
self.thinking_content = ""
self.system_fingerprint: Optional[str] = None
self.received_finish_reason: Optional[str] = None
self.intermittent_finish_reason: Optional[str] = (
@@ -87,12 +99,7 @@
self.holding_chunk = ""
self.complete_response = ""
self.response_uptil_now = ""
_model_info = (
self.logging_obj.model_call_details.get("litellm_params", {}).get(
"model_info", {}
)
or {}
)
_model_info: Dict = litellm_params.model_info or {}
_api_base = get_api_base(
model=model or "",
@@ -873,6 +880,10 @@
_index: Optional[int] = completion_obj.get("index")
if _index is not None:
model_response.choices[0].index = _index
self._optional_combine_thinking_block_in_choices(
model_response=model_response
)
print_verbose(f"returning model_response: {model_response}")
return model_response
else:
@@ -929,6 +940,48 @@
self.chunks.append(model_response)
return
def _optional_combine_thinking_block_in_choices(
self, model_response: ModelResponseStream
) -> None:
"""
UIs like OpenWebUI expect the reasoning to arrive inside the chunk content, wrapped in <think>...</think> tags.
Updates the model_response object in place, merging reasoning_content into content between <think>...</think> tags.
Enabled when `merge_reasoning_content_in_choices=True` is passed in the request params.
"""
if self.merge_reasoning_content_in_choices is True:
reasoning_content = getattr(
model_response.choices[0].delta, "reasoning_content", None
)
if reasoning_content:
if self.sent_first_thinking_block is False:
model_response.choices[0].delta.content += (
"<think>" + reasoning_content
)
self.sent_first_thinking_block = True
elif (
self.sent_first_thinking_block is True
and hasattr(model_response.choices[0].delta, "reasoning_content")
and model_response.choices[0].delta.reasoning_content
):
model_response.choices[0].delta.content = reasoning_content
elif (
self.sent_first_thinking_block is True
and not self.sent_last_thinking_block
and model_response.choices[0].delta.content
):
model_response.choices[0].delta.content = (
"</think>" + model_response.choices[0].delta.content
)
self.sent_last_thinking_block = True
if hasattr(model_response.choices[0].delta, "reasoning_content"):
del model_response.choices[0].delta.reasoning_content
return
def chunk_creator(self, chunk: Any): # type: ignore # noqa: PLR0915
model_response = self.model_response_creator()
response_obj: Dict[str, Any] = {}
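For illustration, a self-contained sketch of the tag-merging state machine above, operating on plain dicts rather than the wrapper's ModelResponseStream deltas. It is not the CustomStreamWrapper implementation itself, and unlike the wrapper (which assumes the first chunk's content is an empty string) it guards against a None starting content:

```
def merge_thinking(deltas):
    sent_first, sent_last = False, False
    merged = []
    for delta in deltas:
        reasoning = delta.get("reasoning_content")
        content = delta.get("content") or ""
        if reasoning and not sent_first:
            # first reasoning chunk: open the thinking block
            content = content + "<think>" + reasoning
            sent_first = True
        elif reasoning and sent_first:
            # subsequent reasoning chunks stream through as-is
            content = reasoning
        elif sent_first and not sent_last and content:
            # first real content chunk: close the thinking block
            content = "</think>" + content
            sent_last = True
        merged.append(content)
    return merged


print(merge_thinking([
    {"content": "", "reasoning_content": "Let me think"},
    {"content": "", "reasoning_content": " step by step"},
    {"content": "The answer is 42", "reasoning_content": None},
]))
# ['<think>Let me think', ' step by step', '</think>The answer is 42']
```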


@@ -1159,6 +1159,9 @@ def completion( # type: ignore # noqa: PLR0915
prompt_id=prompt_id,
prompt_variables=prompt_variables,
ssl_verify=ssl_verify,
merge_reasoning_content_in_choices=kwargs.get(
"merge_reasoning_content_in_choices", None
),
)
logging.update_environment_variables(
model=model,


@@ -1,27 +1,13 @@
model_list:
- model_name: fake-openai-endpoint
- model_name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: claude-special-alias
litellm_params:
model: anthropic/claude-3-haiku-20240307
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: claude-3-5-sonnet-20241022
litellm_params:
model: anthropic/claude-3-5-sonnet-20241022
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: claude-3-7-sonnet-20250219
litellm_params:
model: anthropic/claude-3-7-sonnet-20250219
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: anthropic/*
litellm_params:
model: anthropic/*
api_key: os.environ/ANTHROPIC_API_KEY
model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
thinking: {"type": "enabled", "budget_tokens": 1024}
max_tokens: 1080
merge_reasoning_content_in_choices: true
general_settings:
store_model_in_db: true
store_prompts_in_spend_logs: true
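A sketch of calling this proxy config through the OpenAI SDK; the base URL and API key below are placeholders for a locally running proxy:

```
from openai import OpenAI

# Placeholders: a LiteLLM proxy running locally with the config above.
client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

stream = client.chat.completions.create(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
)
for chunk in stream:
    # reasoning arrives inline as <think>...</think>; no reasoning_content field
    print(chunk.choices[0].delta.content or "", end="")
```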


@@ -192,6 +192,8 @@ class GenericLiteLLMParams(BaseModel):
budget_duration: Optional[str] = None
use_in_pass_through: Optional[bool] = False
model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
merge_reasoning_content_in_choices: Optional[bool] = False
model_info: Optional[Dict] = None
def __init__(
self,
@@ -231,6 +233,9 @@ class GenericLiteLLMParams(BaseModel):
budget_duration: Optional[str] = None,
# Pass through params
use_in_pass_through: Optional[bool] = False,
# This will merge the reasoning content in the choices
merge_reasoning_content_in_choices: Optional[bool] = False,
model_info: Optional[Dict] = None,
**params,
):
args = locals()
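A quick sketch confirming the new field is accepted by GenericLiteLLMParams; since every constructor argument is optional, it can be set in isolation:

```
from litellm.types.router import GenericLiteLLMParams

# The new flag defaults to False and rides along like any other litellm param.
params = GenericLiteLLMParams(merge_reasoning_content_in_choices=True)
assert params.merge_reasoning_content_in_choices is True
assert GenericLiteLLMParams().merge_reasoning_content_in_choices is False
```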


@@ -21,6 +21,8 @@ from openai.types.moderation_create_response import Moderation, ModerationCreate
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from typing_extensions import Callable, Dict, Required, TypedDict, override
import litellm
from ..litellm_core_utils.core_helpers import map_finish_reason
from .guardrails import GuardrailEventHooks
from .llms.openai import (
@@ -1803,6 +1805,7 @@ all_litellm_params = [
"max_budget",
"budget_duration",
"use_in_pass_through",
"merge_reasoning_content_in_choices",
] + list(StandardCallbackDynamicParams.__annotations__.keys())


@@ -46,3 +46,213 @@ def test_is_chunk_non_empty(initialized_custom_stream_wrapper: CustomStreamWrapper
model_response=ModelResponseStream(**chunk),
response_obj=MagicMock(),
)
def test_optional_combine_thinking_block_in_choices(
initialized_custom_stream_wrapper: CustomStreamWrapper,
):
"""Test that reasoning_content is properly combined with content using <think> tags"""
# Setup the wrapper to use the merge feature
initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
# First chunk with reasoning_content - should add <think> tag
first_chunk = {
"id": "chunk1",
"object": "chat.completion.chunk",
"created": 1741037890,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "",
"reasoning_content": "Let me think about this",
},
"finish_reason": None,
}
],
}
# Middle chunk with more reasoning_content
middle_chunk = {
"id": "chunk2",
"object": "chat.completion.chunk",
"created": 1741037891,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": "", "reasoning_content": " step by step"},
"finish_reason": None,
}
],
}
# Final chunk with actual content - should add </think> tag
final_chunk = {
"id": "chunk3",
"object": "chat.completion.chunk",
"created": 1741037892,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": "The answer is 42", "reasoning_content": None},
"finish_reason": None,
}
],
}
# Process first chunk
first_response = ModelResponseStream(**first_chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
first_response
)
print("first_response", json.dumps(first_response, indent=4, default=str))
assert first_response.choices[0].delta.content == "<think>Let me think about this"
# assert the response does not have attribute reasoning_content
assert not hasattr(first_response.choices[0].delta, "reasoning_content")
assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
# Process middle chunk
middle_response = ModelResponseStream(**middle_chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
middle_response
)
print("middle_response", json.dumps(middle_response, indent=4, default=str))
assert middle_response.choices[0].delta.content == " step by step"
assert not hasattr(middle_response.choices[0].delta, "reasoning_content")
# Process final chunk
final_response = ModelResponseStream(**final_chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
final_response
)
print("final_response", json.dumps(final_response, indent=4, default=str))
assert final_response.choices[0].delta.content == "</think>The answer is 42"
assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
assert not hasattr(final_response.choices[0].delta, "reasoning_content")
def test_multi_chunk_reasoning_and_content(
initialized_custom_stream_wrapper: CustomStreamWrapper,
):
"""Test handling of multiple reasoning chunks followed by multiple content chunks"""
# Setup the wrapper to use the merge feature
initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
initialized_custom_stream_wrapper.sent_first_thinking_block = False
initialized_custom_stream_wrapper.sent_last_thinking_block = False
# Create test chunks
chunks = [
# Chunk 1: First reasoning chunk
{
"id": "chunk1",
"object": "chat.completion.chunk",
"created": 1741037890,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "",
"reasoning_content": "To solve this problem",
},
"finish_reason": None,
}
],
},
# Chunk 2: Second reasoning chunk
{
"id": "chunk2",
"object": "chat.completion.chunk",
"created": 1741037891,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "",
"reasoning_content": ", I need to calculate 6 * 7",
},
"finish_reason": None,
}
],
},
# Chunk 3: Third reasoning chunk
{
"id": "chunk3",
"object": "chat.completion.chunk",
"created": 1741037892,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": "", "reasoning_content": " which equals 42"},
"finish_reason": None,
}
],
},
# Chunk 4: First content chunk (transition from reasoning to content)
{
"id": "chunk4",
"object": "chat.completion.chunk",
"created": 1741037893,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {
"content": "The answer to your question",
"reasoning_content": None,
},
"finish_reason": None,
}
],
},
# Chunk 5: Second content chunk
{
"id": "chunk5",
"object": "chat.completion.chunk",
"created": 1741037894,
"model": "deepseek-reasoner",
"choices": [
{
"index": 0,
"delta": {"content": " is 42.", "reasoning_content": None},
"finish_reason": None,
}
],
},
]
# Expected content after processing each chunk
expected_contents = [
"<think>To solve this problem",
", I need to calculate 6 * 7",
" which equals 42",
"</think>The answer to your question",
" is 42.",
]
# Process each chunk and verify results
for i, (chunk, expected_content) in enumerate(zip(chunks, expected_contents)):
response = ModelResponseStream(**chunk)
initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
response
)
# Check content
assert (
response.choices[0].delta.content == expected_content
), f"Chunk {i+1}: content mismatch"
# Check reasoning_content was removed
assert not hasattr(
response.choices[0].delta, "reasoning_content"
), f"Chunk {i+1}: reasoning_content should be removed"
# Verify final state
assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
assert initialized_custom_stream_wrapper.sent_last_thinking_block is True


@@ -2841,3 +2841,72 @@ async def test_bedrock_thinking_in_assistant_message(sync_mode):
"Alright, let's get started with resolving this issue about implementing"
in json_data
)
@pytest.mark.asyncio
async def test_bedrock_stream_thinking_content_openwebui():
"""
When merge_reasoning_content_in_choices=True, the content should be collected as
```
<think>
I am a helpful assistant, the user wants to know who I am
</think>
Hi I am Anthropic, I am a helpful assistant
```
"""
response = await litellm.acompletion(
model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
messages=[{"role": "user", "content": "Hello who is this?"}],
stream=True,
max_tokens=1080,
thinking={"type": "enabled", "budget_tokens": 1024},
merge_reasoning_content_in_choices=True,
)
content = ""
async for chunk in response:
content += chunk.choices[0].delta.content or ""
# OpenWebUI expects reasoning_content to be removed; otherwise it appears as duplicate thinking blocks
assert getattr(chunk.choices[0].delta, "reasoning_content", None) is None
print(chunk)
print("collected content", content)
# Assert that the content follows the expected format with exactly one thinking section
think_open_pos = content.find("<think>")
think_close_pos = content.find("</think>")
# Assert there's exactly one opening and closing tag
assert think_open_pos >= 0, "Opening <think> tag not found"
assert think_close_pos > 0, "Closing </think> tag not found"
assert (
content.count("<think>") == 1
), "There should be exactly one opening <think> tag"
assert (
content.count("</think>") == 1
), "There should be exactly one closing </think> tag"
# Assert the opening tag comes before the closing tag
assert (
think_open_pos < think_close_pos
), "Opening tag should come before closing tag"
# Assert there's content between the tags
thinking_content = content[think_open_pos + 7 : think_close_pos]
assert (
len(thinking_content.strip()) > 0
), "There should be content between thinking tags"
# Assert there's content after the closing tag
assert (
len(content) > think_close_pos + 8
), "There should be content after the thinking tags"
response_content = content[think_close_pos + 8 :].strip()
assert (
len(response_content) > 0
), "There should be non-empty content after thinking tags"