diff --git a/litellm/__init__.py b/litellm/__init__.py
index 9ca1517c92..d66707f8b3 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -277,8 +277,6 @@ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
 custom_prometheus_metadata_labels: List[str] = []
 #### REQUEST PRIORITIZATION ####
 priority_reservation: Optional[Dict[str, float]] = None
-
-
 force_ipv4: bool = (
     False  # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
 )
diff --git a/litellm/litellm_core_utils/get_litellm_params.py b/litellm/litellm_core_utils/get_litellm_params.py
index c0fbb1cb97..cf62375f33 100644
--- a/litellm/litellm_core_utils/get_litellm_params.py
+++ b/litellm/litellm_core_utils/get_litellm_params.py
@@ -57,6 +57,7 @@ def get_litellm_params(
     prompt_variables: Optional[dict] = None,
     async_call: Optional[bool] = None,
     ssl_verify: Optional[bool] = None,
+    merge_reasoning_content_in_choices: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     litellm_params = {
@@ -97,5 +98,6 @@
         "prompt_variables": prompt_variables,
         "async_call": async_call,
         "ssl_verify": ssl_verify,
+        "merge_reasoning_content_in_choices": merge_reasoning_content_in_choices,
     }
     return litellm_params
diff --git a/litellm/litellm_core_utils/streaming_handler.py b/litellm/litellm_core_utils/streaming_handler.py
index 4ce27bfeca..2c7af8d5ba 100644
--- a/litellm/litellm_core_utils/streaming_handler.py
+++ b/litellm/litellm_core_utils/streaming_handler.py
@@ -15,6 +15,7 @@ from litellm import verbose_logger
 from litellm.litellm_core_utils.redact_messages import LiteLLMLoggingObject
 from litellm.litellm_core_utils.thread_pool_executor import executor
 from litellm.types.llms.openai import ChatCompletionChunk
+from litellm.types.router import GenericLiteLLMParams
 from litellm.types.utils import Delta
 from litellm.types.utils import GenericStreamingChunk as GChunk
 from litellm.types.utils import (
@@ -70,6 +71,17 @@ class CustomStreamWrapper:
         self.completion_stream = completion_stream
         self.sent_first_chunk = False
         self.sent_last_chunk = False
+
+        litellm_params: GenericLiteLLMParams = GenericLiteLLMParams(
+            **self.logging_obj.model_call_details.get("litellm_params", {})
+        )
+        self.merge_reasoning_content_in_choices: bool = (
+            litellm_params.merge_reasoning_content_in_choices or False
+        )
+        self.sent_first_thinking_block = False
+        self.sent_last_thinking_block = False
+        self.thinking_content = ""
+
         self.system_fingerprint: Optional[str] = None
         self.received_finish_reason: Optional[str] = None
         self.intermittent_finish_reason: Optional[str] = (
@@ -87,12 +99,7 @@
         self.holding_chunk = ""
         self.complete_response = ""
         self.response_uptil_now = ""
-        _model_info = (
-            self.logging_obj.model_call_details.get("litellm_params", {}).get(
-                "model_info", {}
-            )
-            or {}
-        )
+        _model_info: Dict = litellm_params.model_info or {}

         _api_base = get_api_base(
             model=model or "",
@@ -873,6 +880,10 @@
                 _index: Optional[int] = completion_obj.get("index")
                 if _index is not None:
                     model_response.choices[0].index = _index
+
+                self._optional_combine_thinking_block_in_choices(
+                    model_response=model_response
+                )
                 print_verbose(f"returning model_response: {model_response}")
                 return model_response
             else:
@@ -929,6 +940,48 @@
             self.chunks.append(model_response)
         return

+    def _optional_combine_thinking_block_in_choices(
+        self, model_response: ModelResponseStream
+    ) -> None:
+        """
+        UIs like OpenWebUI expect to get one chunk with <think>...</think> tags in the chunk content.
+
+        In-place updates the model_response object, merging reasoning_content into content wrapped in <think>...</think> tags.
+
+        Enabled when `merge_reasoning_content_in_choices=True` is passed in request params.
+        """
+        if self.merge_reasoning_content_in_choices is True:
+            reasoning_content = getattr(
+                model_response.choices[0].delta, "reasoning_content", None
+            )
+            if reasoning_content:
+                if self.sent_first_thinking_block is False:
+                    model_response.choices[0].delta.content += (
+                        "<think>" + reasoning_content
+                    )
+                    self.sent_first_thinking_block = True
+                elif (
+                    self.sent_first_thinking_block is True
+                    and hasattr(model_response.choices[0].delta, "reasoning_content")
+                    and model_response.choices[0].delta.reasoning_content
+                ):
+                    model_response.choices[0].delta.content = reasoning_content
+            elif (
+                self.sent_first_thinking_block is True
+                and not self.sent_last_thinking_block
+                and model_response.choices[0].delta.content
+            ):
+                model_response.choices[0].delta.content = (
+                    "</think>" + model_response.choices[0].delta.content
+                )
+                self.sent_last_thinking_block = True
+
+            if hasattr(model_response.choices[0].delta, "reasoning_content"):
+                del model_response.choices[0].delta.reasoning_content
+        return
+
     def chunk_creator(self, chunk: Any):  # type: ignore  # noqa: PLR0915
         model_response = self.model_response_creator()
         response_obj: Dict[str, Any] = {}
diff --git a/litellm/main.py b/litellm/main.py
index 28dbf45102..1699e79cf7 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -1159,6 +1159,9 @@ def completion(  # type: ignore # noqa: PLR0915
         prompt_id=prompt_id,
         prompt_variables=prompt_variables,
         ssl_verify=ssl_verify,
+        merge_reasoning_content_in_choices=kwargs.get(
+            "merge_reasoning_content_in_choices", None
+        ),
     )
     logging.update_environment_variables(
         model=model,
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index eef4a55ed3..258eef6307 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -1,27 +1,13 @@
 model_list:
-  - model_name: fake-openai-endpoint
+  - model_name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
     litellm_params:
-      model: openai/my-fake-model
-      api_key: my-fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: claude-special-alias
-    litellm_params:
-      model: anthropic/claude-3-haiku-20240307
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: claude-3-5-sonnet-20241022
-    litellm_params:
-      model: anthropic/claude-3-5-sonnet-20241022
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: claude-3-7-sonnet-20250219
-    litellm_params:
-      model: anthropic/claude-3-7-sonnet-20250219
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: anthropic/*
-    litellm_params:
-      model: anthropic/*
-      api_key: os.environ/ANTHROPIC_API_KEY
+
+      model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
+      thinking: {"type": "enabled", "budget_tokens": 1024}
+      max_tokens: 1080
+      merge_reasoning_content_in_choices: true
+
 general_settings:
   store_model_in_db: true
   store_prompts_in_spend_logs: true
-
diff --git a/litellm/types/router.py b/litellm/types/router.py
index e2c92783da..9a5fb168da 100644
--- a/litellm/types/router.py
+++ b/litellm/types/router.py
@@ -192,6 +192,8 @@ class GenericLiteLLMParams(BaseModel):
     budget_duration: Optional[str] = None
     use_in_pass_through: Optional[bool] = False
     model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+    merge_reasoning_content_in_choices: Optional[bool] = False
+    model_info: Optional[Dict] = None

     def __init__(
         self,
@@ -231,6 +233,9 @@ class GenericLiteLLMParams(BaseModel):
         budget_duration: Optional[str] = None,
         # Pass through params
         use_in_pass_through: Optional[bool] = False,
+        # This will merge the reasoning content in the choices
+        merge_reasoning_content_in_choices: Optional[bool] = False,
+        model_info: Optional[Dict] = None,
         **params,
     ):
         args = locals()
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 894ef70933..5d1bef2762 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -21,6 +21,8 @@ from openai.types.moderation_create_response import Moderation, ModerationCreate
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override

+import litellm
+
 from ..litellm_core_utils.core_helpers import map_finish_reason
 from .guardrails import GuardrailEventHooks
 from .llms.openai import (
@@ -1803,6 +1805,7 @@ all_litellm_params = [
     "max_budget",
     "budget_duration",
     "use_in_pass_through",
+    "merge_reasoning_content_in_choices",
 ] + list(StandardCallbackDynamicParams.__annotations__.keys())
diff --git a/tests/litellm/litellm_core_utils/test_streaming_handler.py b/tests/litellm/litellm_core_utils/test_streaming_handler.py
index 7595c19155..54d178e3ac 100644
--- a/tests/litellm/litellm_core_utils/test_streaming_handler.py
+++ b/tests/litellm/litellm_core_utils/test_streaming_handler.py
@@ -46,3 +46,213 @@ def test_is_chunk_non_empty(initialized_custom_stream_wrapper: CustomStreamWrapp
         model_response=ModelResponseStream(**chunk),
         response_obj=MagicMock(),
     )
+
+
+def test_optional_combine_thinking_block_in_choices(
+    initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+    """Test that reasoning_content is properly combined with content using <think> tags"""
+    # Setup the wrapper to use the merge feature
+    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+
+    # First chunk with reasoning_content - should add <think> tag
+    first_chunk = {
+        "id": "chunk1",
+        "object": "chat.completion.chunk",
+        "created": 1741037890,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {
+                    "content": "",
+                    "reasoning_content": "Let me think about this",
+                },
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Middle chunk with more reasoning_content
+    middle_chunk = {
+        "id": "chunk2",
+        "object": "chat.completion.chunk",
+        "created": 1741037891,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "", "reasoning_content": " step by step"},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Final chunk with actual content - should add </think> tag
+    final_chunk = {
+        "id": "chunk3",
+        "object": "chat.completion.chunk",
+        "created": 1741037892,
+        "model": "deepseek-reasoner",
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "The answer is 42", "reasoning_content": None},
+                "finish_reason": None,
+            }
+        ],
+    }
+
+    # Process first chunk
+    first_response = ModelResponseStream(**first_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        first_response
+    )
+    print("first_response", json.dumps(first_response, indent=4, default=str))
+    assert first_response.choices[0].delta.content == "<think>Let me think about this"
+    # assert the response does not have attribute reasoning_content
+    assert not hasattr(first_response.choices[0].delta, "reasoning_content")
+
+    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+
+    # Process middle chunk
+    middle_response = ModelResponseStream(**middle_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        middle_response
+    )
+    print("middle_response", json.dumps(middle_response, indent=4, default=str))
+    assert middle_response.choices[0].delta.content == " step by step"
+    assert not hasattr(middle_response.choices[0].delta, "reasoning_content")
+
+    # Process final chunk
+    final_response = ModelResponseStream(**final_chunk)
+    initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+        final_response
+    )
+    print("final_response", json.dumps(final_response, indent=4, default=str))
+    assert final_response.choices[0].delta.content == "</think>The answer is 42"
+    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
+    assert not hasattr(final_response.choices[0].delta, "reasoning_content")
+
+
+def test_multi_chunk_reasoning_and_content(
+    initialized_custom_stream_wrapper: CustomStreamWrapper,
+):
+    """Test handling of multiple reasoning chunks followed by multiple content chunks"""
+    # Setup the wrapper to use the merge feature
+    initialized_custom_stream_wrapper.merge_reasoning_content_in_choices = True
+    initialized_custom_stream_wrapper.sent_first_thinking_block = False
+    initialized_custom_stream_wrapper.sent_last_thinking_block = False
+
+    # Create test chunks
+    chunks = [
+        # Chunk 1: First reasoning chunk
+        {
+            "id": "chunk1",
+            "object": "chat.completion.chunk",
+            "created": 1741037890,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                        "reasoning_content": "To solve this problem",
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 2: Second reasoning chunk
+        {
+            "id": "chunk2",
+            "object": "chat.completion.chunk",
+            "created": 1741037891,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                        "reasoning_content": ", I need to calculate 6 * 7",
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 3: Third reasoning chunk
+        {
+            "id": "chunk3",
+            "object": "chat.completion.chunk",
+            "created": 1741037892,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": "", "reasoning_content": " which equals 42"},
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 4: First content chunk (transition from reasoning to content)
+        {
+            "id": "chunk4",
+            "object": "chat.completion.chunk",
+            "created": 1741037893,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "The answer to your question",
+                        "reasoning_content": None,
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        # Chunk 5: Second content chunk
+        {
+            "id": "chunk5",
+            "object": "chat.completion.chunk",
+            "created": 1741037894,
+            "model": "deepseek-reasoner",
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": " is 42.", "reasoning_content": None},
+                    "finish_reason": None,
+                }
+            ],
+        },
+    ]
+
+    # Expected content after processing each chunk
+    expected_contents = [
+        "<think>To solve this problem",
+        ", I need to calculate 6 * 7",
+        " which equals 42",
+        "</think>The answer to your question",
+        " is 42.",
+    ]
+
+    # Process each chunk and verify results
+    for i, (chunk, expected_content) in enumerate(zip(chunks, expected_contents)):
+        response = ModelResponseStream(**chunk)
+        initialized_custom_stream_wrapper._optional_combine_thinking_block_in_choices(
+            response
+        )
+
+        # Check content
+        assert (
+            response.choices[0].delta.content == expected_content
+        ), f"Chunk {i+1}: content mismatch"
+
+        # Check reasoning_content was removed
+        assert not hasattr(
+            response.choices[0].delta, "reasoning_content"
+        ), f"Chunk {i+1}: reasoning_content should be removed"
+
+    # Verify final state
+    assert initialized_custom_stream_wrapper.sent_first_thinking_block is True
+    assert initialized_custom_stream_wrapper.sent_last_thinking_block is True
diff --git a/tests/llm_translation/test_bedrock_completion.py b/tests/llm_translation/test_bedrock_completion.py
index 660da72f24..cc8cc163d4 100644
--- a/tests/llm_translation/test_bedrock_completion.py
+++ b/tests/llm_translation/test_bedrock_completion.py
@@ -2841,3 +2841,72 @@ async def test_bedrock_thinking_in_assistant_message(sync_mode):
         "Alright, let's get started with resolving this issue about implementing"
         in json_data
     )
+
+
+@pytest.mark.asyncio
+async def test_bedrock_stream_thinking_content_openwebui():
+    """
+    When merge_reasoning_content_in_choices=True
+
+    The content should be collected as
+
+    ```
+    <think>
+    I am a helpful assistant, the user wants to know who I am
+    </think>
+
+    Hi I am Anthropic, I am a helpful assistant
+    ```
+    """
+    response = await litellm.acompletion(
+        model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        messages=[{"role": "user", "content": "Hello who is this?"}],
+        stream=True,
+        max_tokens=1080,
+        thinking={"type": "enabled", "budget_tokens": 1024},
+        merge_reasoning_content_in_choices=True,
+    )
+    content = ""
+    async for chunk in response:
+        content += chunk.choices[0].delta.content or ""
+
+        # OpenWebUI expects the reasoning_content to be removed, otherwise this will appear as duplicate thinking blocks
+        assert getattr(chunk.choices[0].delta, "reasoning_content", None) is None
+        print(chunk)
+
+    print("collected content", content)
+
+    # Assert that the content follows the expected format with exactly one thinking section
+    think_open_pos = content.find("<think>")
+    think_close_pos = content.find("</think>")
+
+    # Assert there's exactly one opening and closing tag
+    assert think_open_pos >= 0, "Opening <think> tag not found"
+    assert think_close_pos > 0, "Closing </think> tag not found"
+    assert (
+        content.count("<think>") == 1
+    ), "There should be exactly one opening <think> tag"
+    assert (
+        content.count("</think>") == 1
+    ), "There should be exactly one closing </think> tag"
+
+    # Assert the opening tag comes before the closing tag
+    assert (
+        think_open_pos < think_close_pos
+    ), "Opening <think> tag should come before closing </think> tag"
+
+    # Assert there's content between the tags
+    thinking_content = content[think_open_pos + 7 : think_close_pos]
+    assert (
+        len(thinking_content.strip()) > 0
+    ), "There should be content between thinking tags"
+
+    # Assert there's content after the closing tag
+    assert (
+        len(content) > think_close_pos + 8
+    ), "There should be content after the thinking tags"
+    response_content = content[think_close_pos + 8 :].strip()
+    assert (
+        len(response_content) > 0
+    ), "There should be non-empty content after thinking tags"