diff --git a/litellm/__init__.py b/litellm/__init__.py
index 9ca1517c92..d66707f8b3 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -277,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
custom_prometheus_metadata_labels: List[str] = []
#### REQUEST PRIORITIZATION ####
priority_reservation: Optional[Dict[str, float]] = None
-
-
force_ipv4: bool = (
False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
)
diff --git a/litellm/litellm_core_utils/get_litellm_params.py b/litellm/litellm_core_utils/get_litellm_params.py
index c0fbb1cb97..cf62375f33 100644
--- a/litellm/litellm_core_utils/get_litellm_params.py
+++ b/litellm/litellm_core_utils/get_litellm_params.py
@@ -57,6 +57,7 @@ def get_litellm_params(
prompt_variables: Optional[dict] = None,
async_call: Optional[bool] = None,
ssl_verify: Optional[bool] = None,
+ merge_reasoning_content_in_choices: Optional[bool] = None,
**kwargs,
) -> dict:
litellm_params = {
@@ -97,5 +98,6 @@ def get_litellm_params(
"prompt_variables": prompt_variables,
"async_call": async_call,
"ssl_verify": ssl_verify,
+ "merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
}
return litellm_params
diff --git a/litellm/litellm_core_utils/streaming_handler.py b/litellm/litellm_core_utils/streaming_handler.py
index 4ce27bfeca..2c7af8d5ba 100644
--- a/litellm/litellm_core_utils/streaming_handler.py
+++ b/litellm/litellm_core_utils/streaming_handler.py
@@ -15,6 +15,7 @@ from litellm import verbose_logger
from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.types.llms.openai import ChatCompletionChunk
+from litellm.types.router import GenericLiteLLMParams
from litellm.types.utils import Delta
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import (
@@ -70,6 +71,17 @@ class CustomStreamWrapper:
self.completion_stream = completion_stream
self.sent_first_chunk = False
self.sent_last_chunk = False
+
+ litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
+ **self.logging_obj.model_call_details.get("litellm_params", {})
+ )
+ self.merge_reasoning_content_in_choices: bool = (
+ litellm_params.merge_reasoning_content_in_choices or False
+ )
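+        # State for merging streamed reasoning into a single <think>...</think> block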
+ self.sent_first_thinking_block = False
+ self.sent_last_thinking_block = False
+ self.thinking_content = ""
+
self.system_fingerprint: Optional[str] = None
self.received_finish_reason: Optional[str] = None
self.intermittent_finish_reason: Optional[str] = (
@@ -87,12 +99,7 @@ class CustomStreamWrapper:
self.holding_chunk = ""
self.complete_response = ""
self.response_uptil_now = ""
- _model_info = (
- self.logging_obj.model_call_details.get("litellm_params", {}).get(
- "model_info", {}
- )
- or {}
- )
+ _model_info: Dict = litellm_params.model_info or {}
_api_base = get_api_base(
model=model or "",
@@ -873,6 +880,10 @@ class CustomStreamWrapper:
_index: Optional[int] = completion_obj.get("index")
if _index is not None:
model_response.choices[0].index = _index
+
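+            # Optionally rewrite reasoning_content into <think> tags inside the streamed content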
+ self._optional_combine_thinking_block_in_choices(
+ model_response=model_response
+ )
print_verbose(f"returning model_response: {model_response}")
return model_response
else:
@@ -929,6 +940,48 @@ class CustomStreamWrapper:
self.chunks.append(model_response)
return
+ def _optional_combine_thinking_block_in_choices(
+ self, model_response: ModelResponseStream
+ ) -> None:
+ """
+        UIs like OpenWebUI expect the reasoning to arrive inside the chunk content, wrapped in <think>...</think> tags, rather than in a separate reasoning_content field.
+
+        Updates the model_response object in place: reasoning_content is merged into delta.content and wrapped in <think>...</think> tags.
+
+        Enabled when `merge_reasoning_content_in_choices=True` is passed in the request params.
+
+
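+        Example of the intended transformation (mirrored by the unit tests in this PR):
+            delta.reasoning_content="Let me think"  -> delta.content="<think>Let me think"
+            delta.reasoning_content=" step by step" -> delta.content=" step by step"
+            delta.content="The answer is 42"        -> delta.content="</think>The answer is 42"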
+ """
+ if self.merge_reasoning_content_in_choices is True:
+ reasoning_content = getattr(
+ model_response.choices[0].delta, "reasoning_content", None
+ )
+ if reasoning_content:
+ if self.sent_first_thinking_block is False:
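+                    # First reasoning chunk: open the thinking block with <think>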
+ model_response.choices[0].delta.content += (
+                        "<think>" + reasoning_content
+ )
+ self.sent_first_thinking_block = True
+ elif (
+ self.sent_first_thinking_block is True
+ and hasattr(model_response.choices[0].delta, "reasoning_content")
+ and model_response.choices[0].delta.reasoning_content
+ ):
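+                    # Subsequent reasoning chunks: stream the reasoning text as plain content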
+ model_response.choices[0].delta.content = reasoning_content
+ elif (
+ self.sent_first_thinking_block is True
+ and not self.sent_last_thinking_block
+ and model_response.choices[0].delta.content
+ ):
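+                # First regular content chunk after reasoning: close the thinking block with </think>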
+ model_response.choices[0].delta.content = (
+                    "</think>" + model_response.choices[0].delta.content
+ )
+ self.sent_last_thinking_block = True
+
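+            # Always strip reasoning_content so clients don't render a duplicate thinking block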
+ if hasattr(model_response.choices[0].delta, "reasoning_content"):
+ del model_response.choices[0].delta.reasoning_content
+ return
+
def chunk_creator(self, chunk: Any): # type: ignore # noqa: PLR0915
model_response = self.model_response_creator()
response_obj: Dict[str, Any] = {}
diff --git a/litellm/main.py b/litellm/main.py
index 28dbf45102..1699e79cf7 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -1159,6 +1159,9 @@ def completion( # type: ignore # noqa: PLR0915
prompt_id=prompt_id,
prompt_variables=prompt_variables,
ssl_verify=ssl_verify,
+ merge_reasoning_content_in_choices=kwargs.get(
+ "merge_reasoning_content_in_choices", None
+ ),
)
logging.update_environment_variables(
model=model,
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index eef4a55ed3..258eef6307 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -1,27 +1,13 @@
model_list:
- - model_name: fake-openai-endpoint
+ - model_name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
litellm_params:
- model: openai/my-fake-model
- api_key: my-fake-key
- api_base: https://exampleopenaiendpoint-production.up.railway.app/
- - model_name: claude-special-alias
- litellm_params:
- model: anthropic/claude-3-haiku-20240307
- api_key: os.environ/ANTHROPIC_API_KEY
- - model_name: claude-3-5-sonnet-20241022
- litellm_params:
- model: anthropic/claude-3-5-sonnet-20241022
- api_key: os.environ/ANTHROPIC_API_KEY
- - model_name: claude-3-7-sonnet-20250219
- litellm_params:
- model: anthropic/claude-3-7-sonnet-20250219
- api_key: os.environ/ANTHROPIC_API_KEY
- - model_name: anthropic/*
- litellm_params:
- model: anthropic/*
- api_key: os.environ/ANTHROPIC_API_KEY
+
+ model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+ thinking: {"type": "enabled", "budget_tokens": 1024}
+ max_tokens: 1080
+ merge_reasoning_content_in_choices: true
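+      # merges streamed reasoning into the message content wrapped in <think>...</think> tags (e.g. for OpenWebUI)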
+
general_settings:
store_model_in_db: true
store_prompts_in_spend_logs: true
-
diff --git a/litellm/types/router.py b/litellm/types/router.py
index e2c92783da..9a5fb168da 100644
--- a/litellm/types/router.py
+++ b/litellm/types/router.py
@@ -192,6 +192,8 @@ class GenericLiteLLMParams(BaseModel):
budget_duration: Optional[str] = None
use_in_pass_through: Optional[bool] = False
model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+ merge_reasoning_content_in_choices: Optional[bool] = False
+ model_info: Optional[Dict] = None
def __init__(
self,
@@ -231,6 +233,9 @@ class GenericLiteLLMParams(BaseModel):
budget_duration: Optional[str] = None,
# Pass through params
use_in_pass_through: Optional[bool] = False,
+        # When True, reasoning_content is merged into choices[].delta.content wrapped in <think>...</think> tags
+ merge_reasoning_content_in_choices: Optional[bool] = False,
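+        # model_info is accepted here so the streaming handler can read it off GenericLiteLLMParams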
+ model_info: Optional[Dict] = None,
**params,
):
args = locals()
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 894ef70933..5d1bef2762 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -21,6 +21,8 @@ from openai.types.moderation_create_response import Moderation, ModerationCreate
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from typing_extensions import Callable, Dict, Required, TypedDict, override
+import litellm
+
from ..litellm_core_utils.core_helpers import map_finish_reason
from .guardrails import GuardrailEventHooks
from .llms.openai import (
@@ -1803,6 +1805,7 @@ all_litellm_params = [
"max_budget",
"budget_duration",
"use_in_pass_through",
+ "merge_reasoning_content_in_choices",
] + list(StandardCallbackDynamicParams.__annotations__.keys())
diff --git a/tests/litellm/litellm_core_utils/test_streaming_handler.py b/tests/litellm/litellm_core_utils/test_streaming_handler.py
index 7595c19155..54d178e3ac 100644
--- a/tests/litellm/litellm_core_utils/test_streaming_handler.py
+++ b/tests/litellm/litellm_core_utils/test_streaming_handler.py
@@ -46,3 +46,213 @@ def test_is_chunk_non_empty(initialized_custom_stream_wrapper: CustomStreamWrapp
model_response=ModelResponseStream(**chunk),
response_obj=MagicMock(),
)
+
+
+def test_optional_combine_thinking_block_in_choices(
+ initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+ """Test that reasoning_content is properly combined with content using tags"""
+ # Setup the wrapper to use the merge feature
+ initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+
+    # First chunk with reasoning_content - should add the opening <think> tag
+ first_chunk = {
+ "id": "chunk1",
+ "object": "chat.completion.chunk",
+ "created": 1741037890,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {
+ "content": "",
+ "reasoning_content": "Let me think about this",
+ },
+ "finish_reason": None,
+ }
+ ],
+ }
+
+ # Middle chunk with more reasoning_content
+ middle_chunk = {
+ "id": "chunk2",
+ "object": "chat.completion.chunk",
+ "created": 1741037891,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {"content": "", "reasoning_content": " step by step"},
+ "finish_reason": None,
+ }
+ ],
+ }
+
+    # Final chunk with actual content - should add the closing </think> tag
+ final_chunk = {
+ "id": "chunk3",
+ "object": "chat.completion.chunk",
+ "created": 1741037892,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {"content": "The answer is 42", "reasoning_content": None},
+ "finish_reason": None,
+ }
+ ],
+ }
+
+ # Process first chunk
+ first_response = ModelResponseStream(**first_chunk)
+ initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+ first_response
+ )
+ print("first_response", json.dumps(first_response, indent=4, default=str))
+    assert first_response.choices[0].delta.content == "<think>Let me think about this"
+ # assert the response does not have attribute reasoning_content
+ assert not hasattr(first_response.choices[0].delta, "reasoning_content")
+
+ assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+
+ # Process middle chunk
+ middle_response = ModelResponseStream(**middle_chunk)
+ initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+ middle_response
+ )
+ print("middle_response", json.dumps(middle_response, indent=4, default=str))
+ assert middle_response.choices[0].delta.content == " step by step"
+ assert not hasattr(middle_response.choices[0].delta, "reasoning_content")
+
+ # Process final chunk
+ final_response = ModelResponseStream(**final_chunk)
+ initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+ final_response
+ )
+ print("final_response", json.dumps(final_response, indent=4, default=str))
+    assert final_response.choices[0].delta.content == "</think>The answer is 42"
+ assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
+ assert not hasattr(final_response.choices[0].delta, "reasoning_content")
+
+
+def test_multi_chunk_reasoning_and_content(
+ initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+ """Test handling of multiple reasoning chunks followed by multiple content chunks"""
+ # Setup the wrapper to use the merge feature
+ initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+ initialized_custom_stream_wrapper.sent_first_thinking_block = False
+ initialized_custom_stream_wrapper.sent_last_thinking_block = False
+
+ # Create test chunks
+ chunks = [
+ # Chunk 1: First reasoning chunk
+ {
+ "id": "chunk1",
+ "object": "chat.completion.chunk",
+ "created": 1741037890,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {
+ "content": "",
+ "reasoning_content": "To solve this problem",
+ },
+ "finish_reason": None,
+ }
+ ],
+ },
+ # Chunk 2: Second reasoning chunk
+ {
+ "id": "chunk2",
+ "object": "chat.completion.chunk",
+ "created": 1741037891,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {
+ "content": "",
+ "reasoning_content": ", I need to calculate 6 * 7",
+ },
+ "finish_reason": None,
+ }
+ ],
+ },
+ # Chunk 3: Third reasoning chunk
+ {
+ "id": "chunk3",
+ "object": "chat.completion.chunk",
+ "created": 1741037892,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {"content": "", "reasoning_content": " which equals 42"},
+ "finish_reason": None,
+ }
+ ],
+ },
+ # Chunk 4: First content chunk (transition from reasoning to content)
+ {
+ "id": "chunk4",
+ "object": "chat.completion.chunk",
+ "created": 1741037893,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {
+ "content": "The answer to your question",
+ "reasoning_content": None,
+ },
+ "finish_reason": None,
+ }
+ ],
+ },
+ # Chunk 5: Second content chunk
+ {
+ "id": "chunk5",
+ "object": "chat.completion.chunk",
+ "created": 1741037894,
+ "model": "deepseek-reasoner",
+ "choices": [
+ {
+ "index": 0,
+ "delta": {"content": " is 42.", "reasoning_content": None},
+ "finish_reason": None,
+ }
+ ],
+ },
+ ]
+
+ # Expected content after processing each chunk
+ expected_contents = [
+        "<think>To solve this problem",
+ ", I need to calculate 6 * 7",
+ " which equals 42",
+        "</think>The answer to your question",
+ " is 42.",
+ ]
+
+ # Process each chunk and verify results
+ for i, (chunk, expected_content) in enumerate(zip(chunks, expected_contents)):
+ response = ModelResponseStream(**chunk)
+ initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+ response
+ )
+
+ # Check content
+ assert (
+ response.choices[0].delta.content == expected_content
+ ), f"Chunk {i+1}: content mismatch"
+
+ # Check reasoning_content was removed
+ assert not hasattr(
+ response.choices[0].delta, "reasoning_content"
+ ), f"Chunk {i+1}: reasoning_content should be removed"
+
+ # Verify final state
+ assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+ assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
diff --git a/tests/llm_translation/test_bedrock_completion.py b/tests/llm_translation/test_bedrock_completion.py
index 660da72f24..cc8cc163d4 100644
--- a/tests/llm_translation/test_bedrock_completion.py
+++ b/tests/llm_translation/test_bedrock_completion.py
@@ -2841,3 +2841,72 @@ async def test_bedrock_thinking_in_assistant_message(sync_mode):
"Alright, let's get started with resolving this issue about implementing"
in json_data
)
+
+
+@pytest.mark.asyncio
+async def test_bedrock_stream_thinking_content_openwebui():
+ """
+ When merge_reasoning_content_in_choices=True
+
+ The content should be collected as
+
+    ```
+    <think>
+    I am a helpful assistant, the user wants to know who I am
+    </think>
+
+    Hi I am Anthropic, I am a helpful assistant
+
+    ```
+ """
+ response = await litellm.acompletion(
+ model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+ messages=[{"role": "user", "content": "Hello who is this?"}],
+ stream=True,
+ max_tokens=1080,
+ thinking={"type": "enabled", "budget_tokens": 1024},
+ merge_reasoning_content_in_choices=True,
+ )
+ content = ""
+ async for chunk in response:
+ content += chunk.choices[0].delta.content or ""
+
+        # OpenWebUI expects reasoning_content to be removed; otherwise it renders duplicate thinking blocks
+ assert getattr(chunk.choices[0].delta, "reasoning_content", None) is None
+ print(chunk)
+
+ print("collected content", content)
+
+ # Assert that the content follows the expected format with exactly one thinking section
+    think_open_pos = content.find("<think>")
+    think_close_pos = content.find("</think>")
+
+ # Assert there's exactly one opening and closing tag
+    assert think_open_pos >= 0, "Opening <think> tag not found"
+    assert think_close_pos > 0, "Closing </think> tag not found"
+    assert (
+        content.count("<think>") == 1
+    ), "There should be exactly one opening <think> tag"
+    assert (
+        content.count("</think>") == 1
+    ), "There should be exactly one closing </think> tag"
+
+ # Assert the opening tag comes before the closing tag
+ assert (
+ think_open_pos < think_close_pos
+ ), "Opening tag should come before closing tag"
+
+ # Assert there's content between the tags
+ thinking_content = content[think_open_pos + 7 : think_close_pos]
+ assert (
+ len(thinking_content.strip()) > 0
+ ), "There should be content between thinking tags"
+
+ # Assert there's content after the closing tag
+ assert (
+ len(content) > think_close_pos + 8
+ ), "There should be content after the thinking tags"
+ response_content = content[think_close_pos + 8 :].strip()
+ assert (
+ len(response_content) > 0
+ ), "There should be non-empty content after thinking tags"