Support openrouter reasoning_content on streaming (#9094)

* feat(convert_dict_to_response.py): support openrouter format of reasoning content

* fix(transformation.py): fix openrouter streaming with reasoning content

Fixes https://github.com/BerriAI/litellm/issues/8193#issuecomment-270892962

* fix: fix type error
Krish Dholakia 2025-03-09 20:03:59 -07:00 committed by GitHub
parent 42b7921ca1
commit f899b828cf
6 changed files with 99 additions and 18 deletions

View file

@@ -239,6 +239,24 @@ def _parse_content_for_reasoning(
    return None, message_text


def _extract_reasoning_content(message: dict) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract reasoning content and main content from a message.

    Args:
        message (dict): The message dictionary that may contain reasoning_content

    Returns:
        tuple[Optional[str], Optional[str]]: A tuple of (reasoning_content, content)
    """
    if "reasoning_content" in message:
        return message["reasoning_content"], message["content"]
    elif "reasoning" in message:
        return message["reasoning"], message["content"]
    else:
        return _parse_content_for_reasoning(message.get("content"))


class LiteLLMResponseObjectHandler:
    @staticmethod
@@ -452,13 +470,9 @@ def convert_to_model_response_object( # noqa: PLR0915
                provider_specific_fields[field] = choice["message"][field]

            # Handle reasoning models that display `reasoning_content` within `content`
            if "reasoning_content" in choice["message"]:
                reasoning_content = choice["message"]["reasoning_content"]
                content = choice["message"]["content"]
            else:
                reasoning_content, content = _parse_content_for_reasoning(
                    choice["message"].get("content")
                )
            reasoning_content, content = _extract_reasoning_content(
                choice["message"]
            )

            # Handle thinking models that display `thinking_blocks` within `content`
            thinking_blocks: Optional[List[ChatCompletionThinkingBlock]] = None
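
For context, here is a minimal standalone sketch (not the library code) of the lookup order the new _extract_reasoning_content helper follows: a dedicated reasoning_content field wins, then OpenRouter's reasoning field, then a fallback that parses reasoning out of the content itself. The <think>-tag fallback and the sample messages are illustrative assumptions, not the exact _parse_content_for_reasoning implementation.

import re
from typing import Optional, Tuple


def parse_content_for_reasoning(content: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    # Simplified stand-in for the real fallback: pull reasoning out of a
    # leading <think>...</think> block embedded in the content string.
    if content is None:
        return None, content
    match = re.match(r"^\s*<think>(.*?)</think>(.*)$", content, re.DOTALL)
    if match:
        return match.group(1), match.group(2)
    return None, content


def extract_reasoning_content(message: dict) -> Tuple[Optional[str], Optional[str]]:
    # Mirrors the lookup order of the new helper above.
    if "reasoning_content" in message:          # e.g. DeepSeek-style responses
        return message["reasoning_content"], message["content"]
    elif "reasoning" in message:                # OpenRouter puts it under `reasoning`
        return message["reasoning"], message["content"]
    return parse_content_for_reasoning(message.get("content"))


# OpenRouter-shaped message -> reasoning is lifted into reasoning_content
print(extract_reasoning_content({"reasoning": "step by step...", "content": "Hi!"}))
# ('step by step...', 'Hi!')

# No dedicated field -> fall back to tag parsing inside content
print(extract_reasoning_content({"content": "<think>hmm</think>Hello"}))
# ('hmm', 'Hello')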

View file

@@ -6,7 +6,16 @@ Calls done in OpenAI/openai.py as OpenRouter is openai-compatible.
Docs: https://openrouter.ai/docs/parameters
"""

from typing import Any, AsyncIterator, Iterator, Optional, Union

import httpx

from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.types.utils import ModelResponse, ModelResponseStream

from ...openai.chat.gpt_transformation import OpenAIGPTConfig
from ..common_utils import OpenRouterException


class OpenrouterConfig(OpenAIGPTConfig):
@@ -37,3 +46,43 @@ class OpenrouterConfig(OpenAIGPTConfig):
            extra_body  # openai client supports `extra_body` param
        )
        return mapped_openai_params

    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
    ) -> BaseLLMException:
        return OpenRouterException(
            message=error_message,
            status_code=status_code,
            headers=headers,
        )

    def get_model_response_iterator(
        self,
        streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],
        sync_stream: bool,
        json_mode: Optional[bool] = False,
    ) -> Any:
        return OpenRouterChatCompletionStreamingHandler(
            streaming_response=streaming_response,
            sync_stream=sync_stream,
            json_mode=json_mode,
        )


class OpenRouterChatCompletionStreamingHandler(BaseModelResponseIterator):
    def chunk_parser(self, chunk: dict) -> ModelResponseStream:
        try:
            new_choices = []
            for choice in chunk["choices"]:
                choice["delta"]["reasoning_content"] = choice["delta"].get("reasoning")
                new_choices.append(choice)
            return ModelResponseStream(
                id=chunk["id"],
                object="chat.completion.chunk",
                created=chunk["created"],
                model=chunk["model"],
                choices=new_choices,
            )
        except Exception as e:
            raise e
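
A rough sketch of what the new chunk_parser does with an OpenRouter streaming chunk: it mirrors the provider's `reasoning` delta field into `reasoning_content` so the rest of litellm's streaming pipeline can pick it up. The chunk payload below is an assumed example shape from OpenRouter's OpenAI-compatible stream, not captured output.

# Illustrative OpenRouter streaming chunk (shape assumed) and the field copy
# performed before the handler builds a ModelResponseStream from it.
chunk = {
    "id": "gen-123",
    "object": "chat.completion.chunk",
    "created": 1741500000,
    "model": "anthropic/claude-3.7-sonnet",
    "choices": [
        {"index": 0, "delta": {"role": "assistant", "content": "", "reasoning": "Let me think..."}}
    ],
}

new_choices = []
for choice in chunk["choices"]:
    # OpenRouter streams the thinking text under `reasoning`; downstream litellm
    # code expects it under `reasoning_content`, so the handler mirrors it across.
    choice["delta"]["reasoning_content"] = choice["delta"].get("reasoning")
    new_choices.append(choice)

print(new_choices[0]["delta"]["reasoning_content"])  # "Let me think..."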

View file

@@ -0,0 +1,5 @@
from litellm.llms.base_llm.chat.transformation import BaseLLMException


class OpenRouterException(BaseLLMException):
    pass

View file

@@ -2274,23 +2274,22 @@ def completion( # type: ignore # noqa: PLR0915
        data = {"model": model, "messages": messages, **optional_params}

        ## COMPLETION CALL
        response = openai_like_chat_completion.completion(
        response = base_llm_http_handler.completion(
            model=model,
            stream=stream,
            messages=messages,
            headers=headers,
            api_key=api_key,
            acompletion=acompletion,
            api_base=api_base,
            model_response=model_response,
            print_verbose=print_verbose,
            optional_params=optional_params,
            litellm_params=litellm_params,
            logger_fn=logger_fn,
            logging_obj=logging,
            acompletion=acompletion,
            timeout=timeout,  # type: ignore
            custom_llm_provider="openrouter",
            custom_prompt_dict=custom_prompt_dict,
            timeout=timeout,
            headers=headers,
            encoding=encoding,
            api_key=api_key,
            logging_obj=logging,  # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements
            client=client,
        )
        ## LOGGING
        logging.post_call(

View file

@@ -4830,3 +4830,14 @@ def test_completion_gpt_4o_empty_str():
        messages=[{"role": "user", "content": ""}],
    )
    assert resp.choices[0].message.content is not None


def test_completion_openrouter_reasoning_content():
    litellm._turn_on_debug()
    resp = litellm.completion(
        model="openrouter/anthropic/claude-3.7-sonnet",
        messages=[{"role": "user", "content": "Hello world"}],
        reasoning={"effort": "high"},
    )
    print(resp)
    assert resp.choices[0].message.reasoning_content is not None

View file

@@ -4069,7 +4069,8 @@ def test_mock_response_iterator_tool_use():
    "model",
    [
        # "deepseek/deepseek-reasoner",
        "anthropic/claude-3-7-sonnet-20250219",
        # "anthropic/claude-3-7-sonnet-20250219",
        "openrouter/anthropic/claude-3.7-sonnet",
    ],
)
def test_reasoning_content_completion(model):
@@ -4080,7 +4081,9 @@ def test_reasoning_content_completion(model):
        model=model,
        messages=[{"role": "user", "content": "Tell me a joke."}],
        stream=True,
        thinking={"type": "enabled", "budget_tokens": 1024},
        # thinking={"type": "enabled", "budget_tokens": 1024},
        reasoning={"effort": "high"},
        drop_params=True,
    )
    reasoning_content_exists = False
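
A usage sketch mirroring the streaming test above: iterate the stream and collect reasoning_content from each delta. The reasoning_content attribute access and the OPENROUTER_API_KEY environment requirement are assumptions for illustration, not part of this diff.

import litellm

# Assumes OPENROUTER_API_KEY is configured in the environment.
stream = litellm.completion(
    model="openrouter/anthropic/claude-3.7-sonnet",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    stream=True,
    reasoning={"effort": "high"},
)

reasoning_chunks = []
for chunk in stream:
    delta = chunk.choices[0].delta
    # With this change, OpenRouter's `reasoning` field surfaces as `reasoning_content`.
    if getattr(delta, "reasoning_content", None):
        reasoning_chunks.append(delta.reasoning_content)

print("".join(reasoning_chunks))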