Mirror of https://github.com/BerriAI/litellm.git, synced 2025-04-25 02:34:29 +00:00
[Feat] - Display thinking tokens on OpenWebUI (Bedrock, Anthropic, Deepseek) (#9029)
All checks were successful: Read Version from pyproject.toml / read-version (push), successful in 14s
* if merge_reasoning_content_in_choices
* _optional_combine_thinking_block_in_choices
* stash changes
* working merge_reasoning_content_in_choices with bedrock
* fix litellm_params accessor
* fix streaming handler
* merge_reasoning_content_in_choices
* _optional_combine_thinking_block_in_choices
* test_bedrock_stream_thinking_content_openwebui
* merge_reasoning_content_in_choices
* fix for _optional_combine_thinking_block_in_choices
* linting error fix
This commit is contained in: parent 85d1427710, commit b02af305de

9 changed files with 358 additions and 29 deletions
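At the SDK level, the new behavior is driven by a single request parameter, `merge_reasoning_content_in_choices`. A minimal usage sketch, mirroring the new `test_bedrock_stream_thinking_content_openwebui` test further down in this diff; it assumes AWS Bedrock credentials are configured in the environment and that the Claude 3.7 Sonnet model id below is enabled for the account:

```python
# Minimal sketch of the new flag, mirroring test_bedrock_stream_thinking_content_openwebui.
# Assumes AWS Bedrock credentials are set in the environment and that the
# us.anthropic.claude-3-7-sonnet model id is enabled for this account.
import litellm

response = litellm.completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
    max_tokens=1080,
    thinking={"type": "enabled", "budget_tokens": 1024},
    # New in this PR: fold reasoning_content into delta.content as <think>...</think>
    merge_reasoning_content_in_choices=True,
)

content = ""
for chunk in response:
    # reasoning_content is removed from each delta; the thinking tokens arrive
    # inline in content, wrapped once in <think>...</think>
    content += chunk.choices[0].delta.content or ""

print(content)
```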
```diff
@@ -277,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None


force_ipv4: bool = (
    False  # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
```
```diff
@@ -57,6 +57,7 @@ def get_litellm_params(
    prompt_variables: Optional[dict] = None,
    async_call: Optional[bool] = None,
    ssl_verify: Optional[bool] = None,
    merge_reasoning_content_in_choices: Optional[bool] = None,
    **kwargs,
) -> dict:
    litellm_params = {
@@ -97,5 +98,6 @@ def get_litellm_params(
        "prompt_variables": prompt_variables,
        "async_call": async_call,
        "ssl_verify": ssl_verify,
        "merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
    }
    return litellm_params
```
```diff
@@ -15,6 +15,7 @@ from litellm import verbose_logger
from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.types.llms.openai import ChatCompletionChunk
from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import Delta
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import (
@@ -70,6 +71,17 @@ class CustomStreamWrapper:
        self.completion_stream = completion_stream
        self.sent_first_chunk = False
        self.sent_last_chunk = False

        litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
            **self.logging_obj.model_call_details.get("litellm_params", {})
        )
        self.merge_reasoning_content_in_choices: bool = (
            litellm_params.merge_reasoning_content_in_choices or False
        )
        self.sent_first_thinking_block = False
        self.sent_last_thinking_block = False
        self.thinking_content = ""

        self.system_fingerprint: Optional[str] = None
        self.received_finish_reason: Optional[str] = None
        self.intermittent_finish_reason: Optional[str] = (
@@ -87,12 +99,7 @@ class CustomStreamWrapper:
        self.holding_chunk = ""
        self.complete_response = ""
        self.response_uptil_now = ""
        _model_info = (
            self.logging_obj.model_call_details.get("litellm_params", {}).get(
                "model_info", {}
            )
            or {}
        )
        _model_info: Dict = litellm_params.model_info or {}

        _api_base = get_api_base(
            model=model or "",
@@ -873,6 +880,10 @@ class CustomStreamWrapper:
                _index: Optional[int] = completion_obj.get("index")
                if _index is not None:
                    model_response.choices[0].index = _index

                self._optional_combine_thinking_block_in_choices(
                    model_response=model_response
                )
                print_verbose(f"returning model_response: {model_response}")
                return model_response
            else:
@@ -929,6 +940,48 @@ class CustomStreamWrapper:
            self.chunks.append(model_response)
            return

    def _optional_combine_thinking_block_in_choices(
        self, model_response: ModelResponseStream
    ) -> None:
        """
        UI's Like OpenWebUI expect to get 1 chunk with <think>...</think> tags in the chunk content

        In place updates the model_response object with reasoning_content in content with <think>...</think> tags

        Enabled when `merge_reasoning_content_in_choices=True` passed in request params
        """
        if self.merge_reasoning_content_in_choices is True:
            reasoning_content = getattr(
                model_response.choices[0].delta, "reasoning_content", None
            )
            if reasoning_content:
                if self.sent_first_thinking_block is False:
                    model_response.choices[0].delta.content += (
                        "<think>" + reasoning_content
                    )
                    self.sent_first_thinking_block = True
                elif (
                    self.sent_first_thinking_block is True
                    and hasattr(model_response.choices[0].delta, "reasoning_content")
                    and model_response.choices[0].delta.reasoning_content
                ):
                    model_response.choices[0].delta.content = reasoning_content
            elif (
                self.sent_first_thinking_block is True
                and not self.sent_last_thinking_block
                and model_response.choices[0].delta.content
            ):
                model_response.choices[0].delta.content = (
                    "</think>" + model_response.choices[0].delta.content
                )
                self.sent_last_thinking_block = True

            if hasattr(model_response.choices[0].delta, "reasoning_content"):
                del model_response.choices[0].delta.reasoning_content
        return

    def chunk_creator(self, chunk: Any):  # type: ignore  # noqa: PLR0915
        model_response = self.model_response_creator()
        response_obj: Dict[str, Any] = {}
```
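For the merge behavior at a glance: below is a standalone re-statement of the state machine above, applied to three simplified deltas. This is an illustration only, not the shipped code; the shipped logic is `_optional_combine_thinking_block_in_choices`, which mutates `ModelResponseStream` objects in place.

```python
# Illustration of the <think> tag placement across a stream of simplified deltas.
# Not the shipped code; see _optional_combine_thinking_block_in_choices above.
deltas = [
    {"content": "", "reasoning_content": "Let me think"},
    {"content": "", "reasoning_content": " step by step"},
    {"content": "The answer is 42", "reasoning_content": None},
]

sent_first_thinking_block = False
sent_last_thinking_block = False
merged = []

for delta in deltas:
    reasoning = delta["reasoning_content"]
    content = delta["content"]
    if reasoning:
        # the first reasoning delta opens the block, later ones stream through unchanged
        content = reasoning if sent_first_thinking_block else "<think>" + reasoning
        sent_first_thinking_block = True
    elif sent_first_thinking_block and not sent_last_thinking_block and content:
        # the first non-reasoning content delta closes the block
        content = "</think>" + content
        sent_last_thinking_block = True
    merged.append(content)

print("".join(merged))  # <think>Let me think step by step</think>The answer is 42
```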
```diff
@@ -1159,6 +1159,9 @@ def completion(  # type: ignore # noqa: PLR0915
        prompt_id=prompt_id,
        prompt_variables=prompt_variables,
        ssl_verify=ssl_verify,
        merge_reasoning_content_in_choices=kwargs.get(
            "merge_reasoning_content_in_choices", None
        ),
    )
    logging.update_environment_variables(
        model=model,
```
```diff
@@ -1,27 +1,13 @@
model_list:
  - model_name: fake-openai-endpoint
  - model_name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: claude-special-alias
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-5-sonnet-20241022
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-7-sonnet-20250219
    litellm_params:
      model: anthropic/claude-3-7-sonnet-20250219
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: anthropic/*
    litellm_params:
      model: anthropic/*
      api_key: os.environ/ANTHROPIC_API_KEY

      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
      thinking: {"type": "enabled", "budget_tokens": 1024}
      max_tokens: 1080
      merge_reasoning_content_in_choices: true


general_settings:
  store_model_in_db: true
  store_prompts_in_spend_logs: true
```
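With the proxy config above, any OpenAI-compatible client (which is how OpenWebUI talks to the LiteLLM proxy) should receive the merged `<think>...</think>` content with no separate `reasoning_content` field on the deltas. A hedged sketch; the proxy URL and `sk-1234` key below are placeholders for whatever the local deployment actually uses:

```python
# Sketch: calling the proxy-registered Bedrock model through an OpenAI-compatible client.
# Assumes a LiteLLM proxy running locally on port 4000 with the config above;
# base_url and api_key are placeholders for your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

stream = client.chat.completions.create(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "Hello, who is this?"}],
    stream=True,
)

content = ""
for chunk in stream:
    # The thinking tokens arrive inline as <think>...</think>, so a UI that only
    # renders `content` (like OpenWebUI) still shows the reasoning once.
    content += chunk.choices[0].delta.content or ""

print(content)
```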
```diff
@@ -192,6 +192,8 @@ class GenericLiteLLMParams(BaseModel):
    budget_duration: Optional[str] = None
    use_in_pass_through: Optional[bool] = False
    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
    merge_reasoning_content_in_choices: Optional[bool] = False
    model_info: Optional[Dict] = None

    def __init__(
        self,
@@ -231,6 +233,9 @@ class GenericLiteLLMParams(BaseModel):
        budget_duration: Optional[str] = None,
        # Pass through params
        use_in_pass_through: Optional[bool] = False,
        # This will merge the reasoning content in the choices
        merge_reasoning_content_in_choices: Optional[bool] = False,
        model_info: Optional[Dict] = None,
        **params,
    ):
        args = locals()
```
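The streaming wrapper reads the flag by hydrating `GenericLiteLLMParams` from whatever dict is stored in `model_call_details["litellm_params"]`; because the model is declared with `extra="allow"`, unrelated keys pass through untouched and the new field simply defaults to `False` when absent. A small sketch of that behavior:

```python
# Sketch of how the new field resolves from a loose litellm_params dict,
# mirroring the GenericLiteLLMParams(**...) call in CustomStreamWrapper.__init__ above.
from litellm.types.router import GenericLiteLLMParams

params = GenericLiteLLMParams(
    **{"merge_reasoning_content_in_choices": True, "some_unrelated_key": "ok"}
)
assert params.merge_reasoning_content_in_choices is True

# When the caller never sets the flag, the wrapper's `or False` keeps merging disabled.
default_params = GenericLiteLLMParams()
assert (default_params.merge_reasoning_content_in_choices or False) is False
```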
```diff
@@ -21,6 +21,8 @@ from openai.types.moderation_create_response import Moderation, ModerationCreate
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from typing_extensions import Callable, Dict, Required, TypedDict, override

import litellm

from ..litellm_core_utils.core_helpers import map_finish_reason
from .guardrails import GuardrailEventHooks
from .llms.openai import (
@@ -1803,6 +1805,7 @@ all_litellm_params = [
    "max_budget",
    "budget_duration",
    "use_in_pass_through",
    "merge_reasoning_content_in_choices",
] + list(StandardCallbackDynamicParams.__annotations__.keys())
```
```diff
@@ -46,3 +46,213 @@ def test_is_chunk_non_empty(initialized_custom_stream_wrapper: CustomStreamWrapper
        model_response=ModelResponseStream(**chunk),
        response_obj=MagicMock(),
    )


def test_optional_combine_thinking_block_in_choices(
    initialized_custom_stream_wrapper: CustomStreamWrapper,
):
    """Test that reasoning_content is properly combined with content using <think> tags"""
    # Setup the wrapper to use the merge feature
    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True

    # First chunk with reasoning_content - should add <think> tag
    first_chunk = {
        "id": "chunk1",
        "object": "chat.completion.chunk",
        "created": 1741037890,
        "model": "deepseek-reasoner",
        "choices": [
            {
                "index": 0,
                "delta": {
                    "content": "",
                    "reasoning_content": "Let me think about this",
                },
                "finish_reason": None,
            }
        ],
    }

    # Middle chunk with more reasoning_content
    middle_chunk = {
        "id": "chunk2",
        "object": "chat.completion.chunk",
        "created": 1741037891,
        "model": "deepseek-reasoner",
        "choices": [
            {
                "index": 0,
                "delta": {"content": "", "reasoning_content": " step by step"},
                "finish_reason": None,
            }
        ],
    }

    # Final chunk with actual content - should add </think> tag
    final_chunk = {
        "id": "chunk3",
        "object": "chat.completion.chunk",
        "created": 1741037892,
        "model": "deepseek-reasoner",
        "choices": [
            {
                "index": 0,
                "delta": {"content": "The answer is 42", "reasoning_content": None},
                "finish_reason": None,
            }
        ],
    }

    # Process first chunk
    first_response = ModelResponseStream(**first_chunk)
    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
        first_response
    )
    print("first_response", json.dumps(first_response, indent=4, default=str))
    assert first_response.choices[0].delta.content == "<think>Let me think about this"
    # assert the response does not have attribute reasoning_content
    assert not hasattr(first_response.choices[0].delta, "reasoning_content")

    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True

    # Process middle chunk
    middle_response = ModelResponseStream(**middle_chunk)
    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
        middle_response
    )
    print("middle_response", json.dumps(middle_response, indent=4, default=str))
    assert middle_response.choices[0].delta.content == " step by step"
    assert not hasattr(middle_response.choices[0].delta, "reasoning_content")

    # Process final chunk
    final_response = ModelResponseStream(**final_chunk)
    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
        final_response
    )
    print("final_response", json.dumps(final_response, indent=4, default=str))
    assert final_response.choices[0].delta.content == "</think>The answer is 42"
    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
    assert not hasattr(final_response.choices[0].delta, "reasoning_content")


def test_multi_chunk_reasoning_and_content(
    initialized_custom_stream_wrapper: CustomStreamWrapper,
):
    """Test handling of multiple reasoning chunks followed by multiple content chunks"""
    # Setup the wrapper to use the merge feature
    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
    initialized_custom_stream_wrapper.sent_first_thinking_block = False
    initialized_custom_stream_wrapper.sent_last_thinking_block = False

    # Create test chunks
    chunks = [
        # Chunk 1: First reasoning chunk
        {
            "id": "chunk1",
            "object": "chat.completion.chunk",
            "created": 1741037890,
            "model": "deepseek-reasoner",
            "choices": [
                {
                    "index": 0,
                    "delta": {
                        "content": "",
                        "reasoning_content": "To solve this problem",
                    },
                    "finish_reason": None,
                }
            ],
        },
        # Chunk 2: Second reasoning chunk
        {
            "id": "chunk2",
            "object": "chat.completion.chunk",
            "created": 1741037891,
            "model": "deepseek-reasoner",
            "choices": [
                {
                    "index": 0,
                    "delta": {
                        "content": "",
                        "reasoning_content": ", I need to calculate 6 * 7",
                    },
                    "finish_reason": None,
                }
            ],
        },
        # Chunk 3: Third reasoning chunk
        {
            "id": "chunk3",
            "object": "chat.completion.chunk",
            "created": 1741037892,
            "model": "deepseek-reasoner",
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": "", "reasoning_content": " which equals 42"},
                    "finish_reason": None,
                }
            ],
        },
        # Chunk 4: First content chunk (transition from reasoning to content)
        {
            "id": "chunk4",
            "object": "chat.completion.chunk",
            "created": 1741037893,
            "model": "deepseek-reasoner",
            "choices": [
                {
                    "index": 0,
                    "delta": {
                        "content": "The answer to your question",
                        "reasoning_content": None,
                    },
                    "finish_reason": None,
                }
            ],
        },
        # Chunk 5: Second content chunk
        {
            "id": "chunk5",
            "object": "chat.completion.chunk",
            "created": 1741037894,
            "model": "deepseek-reasoner",
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": " is 42.", "reasoning_content": None},
                    "finish_reason": None,
                }
            ],
        },
    ]

    # Expected content after processing each chunk
    expected_contents = [
        "<think>To solve this problem",
        ", I need to calculate 6 * 7",
        " which equals 42",
        "</think>The answer to your question",
        " is 42.",
    ]

    # Process each chunk and verify results
    for i, (chunk, expected_content) in enumerate(zip(chunks, expected_contents)):
        response = ModelResponseStream(**chunk)
        initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
            response
        )

        # Check content
        assert (
            response.choices[0].delta.content == expected_content
        ), f"Chunk {i+1}: content mismatch"

        # Check reasoning_content was removed
        assert not hasattr(
            response.choices[0].delta, "reasoning_content"
        ), f"Chunk {i+1}: reasoning_content should be removed"

    # Verify final state
    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
```
````diff
@@ -2841,3 +2841,72 @@ async def test_bedrock_thinking_in_assistant_message(sync_mode):
        "Alright, let's get started with resolving this issue about implementing"
        in json_data
    )


@pytest.mark.asyncio
async def test_bedrock_stream_thinking_content_openwebui():
    """
    When merge_reasoning_content_in_choices=True

    The content should be collected as

    ```
    <think>
    I am a helpful assistant, the user wants to know who I am
    </think>

    Hi I am Anthropic, I am a helpful assistant
    ```
    """
    response = await litellm.acompletion(
        model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
        messages=[{"role": "user", "content": "Hello who is this?"}],
        stream=True,
        max_tokens=1080,
        thinking={"type": "enabled", "budget_tokens": 1024},
        merge_reasoning_content_in_choices=True,
    )
    content = ""
    async for chunk in response:
        content += chunk.choices[0].delta.content or ""

        # OpenWebUI expects the reasoning_content to be removed, otherwise this will appear as duplicate thinking blocks
        assert getattr(chunk.choices[0].delta, "reasoning_content", None) is None
        print(chunk)

    print("collected content", content)

    # Assert that the content follows the expected format with exactly one thinking section
    think_open_pos = content.find("<think>")
    think_close_pos = content.find("</think>")

    # Assert there's exactly one opening and closing tag
    assert think_open_pos >= 0, "Opening <think> tag not found"
    assert think_close_pos > 0, "Closing </think> tag not found"
    assert (
        content.count("<think>") == 1
    ), "There should be exactly one opening <think> tag"
    assert (
        content.count("</think>") == 1
    ), "There should be exactly one closing </think> tag"

    # Assert the opening tag comes before the closing tag
    assert (
        think_open_pos < think_close_pos
    ), "Opening tag should come before closing tag"

    # Assert there's content between the tags
    thinking_content = content[think_open_pos + 7 : think_close_pos]
    assert (
        len(thinking_content.strip()) > 0
    ), "There should be content between thinking tags"

    # Assert there's content after the closing tag
    assert (
        len(content) > think_close_pos + 8
    ), "There should be content after the thinking tags"
    response_content = content[think_close_pos + 8 :].strip()
    assert (
        len(response_content) > 0
    ), "There should be non-empty content after thinking tags"
````