Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 18:54:30 +00:00)
Litellm dev 01 25 2025 p4 (#8006)
All checks were successful
Read Version from pyproject.toml / read-version (push) Successful in 34s
* feat(main.py): use asyncio.sleep for mock_timeout=True on async request
Adds unit tests to ensure the proxy does not fail if specific OpenAI requests hang (e.g. the recent o1 outage).
* fix(streaming_handler.py): fix deepseek r1 return reasoning content on streaming
Fixes https://github.com/BerriAI/litellm/issues/7942
* Revert "fix(streaming_handler.py): fix deepseek r1 return reasoning content on streaming"
This reverts commit 7a052a64e3.
* fix(deepseek-r-1): return reasoning_content as a top-level param
Ensures compatibility with existing tools that use it.
* fix: fix linting error
This commit is contained in: parent 03eef5a2a0, commit 6bafdbc546
8 changed files with 108 additions and 19 deletions
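A minimal sketch of the new mock-timeout behavior described above (not from the diff; the model name is arbitrary since, per the acompletion change below, the mock path sleeps and raises before any provider call is made):

import asyncio

import litellm


async def main():
    try:
        # mock_timeout=True makes acompletion await asyncio.sleep(timeout)
        # (keeping the event loop free for other tasks) and then raise
        # litellm.Timeout -- no network request is sent.
        await litellm.acompletion(
            model="gpt-3.5-turbo",  # arbitrary; never actually called
            messages=[{"role": "user", "content": "hi"}],
            mock_timeout=True,
            timeout=1.0,
        )
    except litellm.Timeout as e:
        print(f"caught mock timeout: {e}")


asyncio.run(main())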
@@ -471,6 +471,7 @@ class CustomStreamWrapper:
             finish_reason = None
             logprobs = None
             usage = None
+
             if str_line and str_line.choices and len(str_line.choices) > 0:
                 if (
                     str_line.choices[0].delta is not None
@@ -750,6 +751,7 @@ class CustomStreamWrapper:
                 "function_call" in completion_obj
                 and completion_obj["function_call"] is not None
             )
+            or (model_response.choices[0].delta.provider_specific_fields is not None)
             or (
                 "provider_specific_fields" in response_obj
                 and response_obj["provider_specific_fields"] is not None
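The added disjunct matters for deepseek-r1: a streamed chunk whose delta carries only provider_specific_fields (reasoning, no visible content) must not be treated as empty and dropped. A simplified sketch of the check, showing only the branches visible in this hunk (the helper name is hypothetical; the real condition has further branches for content, tool calls, etc.):

def _chunk_has_payload(completion_obj: dict, model_response, response_obj: dict) -> bool:
    # simplified: only the branches shown in the hunk above
    return (
        ("function_call" in completion_obj and completion_obj["function_call"] is not None)
        or (model_response.choices[0].delta.provider_specific_fields is not None)
        or (
            "provider_specific_fields" in response_obj
            and response_obj["provider_specific_fields"] is not None
        )
    )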
@@ -383,6 +383,10 @@ async def acompletion(
         - If `stream` is True, the function returns an async generator that yields completion lines.
     """
     fallbacks = kwargs.get("fallbacks", None)
+    mock_timeout = kwargs.get("mock_timeout", None)
 
+    if mock_timeout is True:
+        await _handle_mock_timeout_async(mock_timeout, timeout, model)
+
     loop = asyncio.get_event_loop()
     custom_llm_provider = kwargs.get("custom_llm_provider", None)
@@ -565,12 +569,7 @@ def _handle_mock_timeout(
     model: str,
 ):
     if mock_timeout is True and timeout is not None:
-        if isinstance(timeout, float):
-            time.sleep(timeout)
-        elif isinstance(timeout, str):
-            time.sleep(float(timeout))
-        elif isinstance(timeout, httpx.Timeout) and timeout.connect is not None:
-            time.sleep(timeout.connect)
+        _sleep_for_timeout(timeout)
         raise litellm.Timeout(
             message="This is a mock timeout error",
             llm_provider="openai",
@@ -578,6 +577,38 @@ def _handle_mock_timeout(
         )
 
 
+async def _handle_mock_timeout_async(
+    mock_timeout: Optional[bool],
+    timeout: Optional[Union[float, str, httpx.Timeout]],
+    model: str,
+):
+    if mock_timeout is True and timeout is not None:
+        await _sleep_for_timeout_async(timeout)
+        raise litellm.Timeout(
+            message="This is a mock timeout error",
+            llm_provider="openai",
+            model=model,
+        )
+
+
+def _sleep_for_timeout(timeout: Union[float, str, httpx.Timeout]):
+    if isinstance(timeout, float):
+        time.sleep(timeout)
+    elif isinstance(timeout, str):
+        time.sleep(float(timeout))
+    elif isinstance(timeout, httpx.Timeout) and timeout.connect is not None:
+        time.sleep(timeout.connect)
+
+
+async def _sleep_for_timeout_async(timeout: Union[float, str, httpx.Timeout]):
+    if isinstance(timeout, float):
+        await asyncio.sleep(timeout)
+    elif isinstance(timeout, str):
+        await asyncio.sleep(float(timeout))
+    elif isinstance(timeout, httpx.Timeout) and timeout.connect is not None:
+        await asyncio.sleep(timeout.connect)
+
+
 def mock_completion(
     model: str,
     messages: List,
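The split into _sleep_for_timeout and _sleep_for_timeout_async is the heart of the fix: time.sleep blocks the event loop, so a mocked hang on the async path would stall every other in-flight request on the proxy, while asyncio.sleep yields control. A self-contained illustration (not from the diff):

import asyncio
import time


async def worker(name: str) -> None:
    await asyncio.sleep(0.1)
    print(f"{name} finished")


async def mock_hang() -> None:
    # time.sleep(1) here would freeze the loop and delay both workers;
    # asyncio.sleep(1) yields, so the workers finish while this "hangs".
    await asyncio.sleep(1)


async def main() -> None:
    await asyncio.gather(mock_hang(), worker("a"), worker("b"))


asyncio.run(main())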
@@ -15,4 +15,4 @@ model_list:
       model: anthropic.claude-3-sonnet-20240229-v1:0
 
 litellm_settings:
   callbacks: ["langsmith"]
@@ -487,6 +487,8 @@ class Message(OpenAIObject):
 
         if provider_specific_fields:  # set if provider_specific_fields is not empty
             self.provider_specific_fields = provider_specific_fields
+            for k, v in provider_specific_fields.items():
+                setattr(self, k, v)
 
     def get(self, key, default=None):
         # Custom .get() method to access attributes with a default value if the attribute doesn't exist
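The setattr loop mirrors each provider-specific key onto the Message object itself, which is what lets existing tools read reasoning_content as a top-level attribute. A small illustration (assumes provider_specific_fields is an __init__ parameter, as the surrounding context suggests; values are hypothetical):

from litellm.types.utils import Message

msg = Message(
    content="Here's a joke...",
    provider_specific_fields={"reasoning_content": "User wants a joke..."},
)
assert msg.reasoning_content == "User wants a joke..."  # new: top-level access
assert msg.provider_specific_fields["reasoning_content"] == msg.reasoning_content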
@@ -522,18 +524,18 @@ class Delta(OpenAIObject):
         audio: Optional[ChatCompletionAudioResponse] = None,
         **params,
     ):
+        super(Delta, self).__init__(**params)
         provider_specific_fields: Dict[str, Any] = {}
         if "reasoning_content" in params:
             provider_specific_fields["reasoning_content"] = params["reasoning_content"]
-            del params["reasoning_content"]
-        super(Delta, self).__init__(**params)
+            setattr(self, "reasoning_content", params["reasoning_content"])
         self.content = content
         self.role = role
-        self.provider_specific_fields = provider_specific_fields
         # Set default values and correct types
         self.function_call: Optional[Union[FunctionCall, Any]] = None
         self.tool_calls: Optional[List[Union[ChatCompletionDeltaToolCall, Any]]] = None
         self.audio: Optional[ChatCompletionAudioResponse] = None
 
         if provider_specific_fields:  # set if provider_specific_fields is not empty
             self.provider_specific_fields = provider_specific_fields
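Same idea on the streaming side: Delta now sets reasoning_content via setattr (and calls the parent constructor up front) instead of deleting the key, so a reasoning-only chunk exposes the field directly. A sketch (assumes the parent class tolerates the extra kwarg, which the diff implies since **params still contains reasoning_content when passed through):

from litellm.types.utils import Delta

# deepseek-r1 style streaming delta: reasoning arrives before any content
delta = Delta(content=None, reasoning_content="Let me think...")
print(delta.reasoning_content)         # top-level attribute (new)
print(delta.provider_specific_fields)  # {'reasoning_content': 'Let me think...'}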
@@ -801,6 +803,7 @@ class StreamingChatCompletionChunk(OpenAIChatCompletionChunk):
             new_choice = StreamingChoices(**choice).model_dump()
             new_choices.append(new_choice)
         kwargs["choices"] = new_choices
 
         super().__init__(**kwargs)
+
 
@@ -97,6 +97,14 @@ model_list:
       rpm: 1000
     model_info:
       health_check_timeout: 1
+  - model_name: good-model
+    litellm_params:
+      model: openai/bad-model
+      api_key: os.environ/OPENAI_API_KEY
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+      rpm: 1000
+    model_info:
+      health_check_timeout: 1
   - model_name: "*"
     litellm_params:
       model: openai/*
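For context, a minimal way to hit the routes this config adds, against a locally running proxy (a sketch; assumes the proxy listens on :4000 with master key sk-1234, matching the test at the bottom of this diff):

from openai import OpenAI

client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
resp = client.chat.completions.create(
    model="good-model",  # routed per the model_list entry above
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)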
@@ -4546,10 +4546,7 @@ def test_deepseek_reasoning_content_completion():
         messages=[{"role": "user", "content": "Tell me a joke."}],
     )
 
-    assert (
-        resp.choices[0].message.provider_specific_fields["reasoning_content"]
-        is not None
-    )
+    assert resp.choices[0].message.reasoning_content is not None
 
 
 @pytest.mark.parametrize(
@@ -4066,7 +4066,7 @@ def test_mock_response_iterator_tool_use():
 
 
 def test_deepseek_reasoning_content_completion():
-    litellm.set_verbose = True
+    # litellm.set_verbose = True
     resp = litellm.completion(
         model="deepseek/deepseek-reasoner",
         messages=[{"role": "user", "content": "Tell me a joke."}],
@@ -4076,8 +4076,7 @@ def test_deepseek_reasoning_content_completion():
     reasoning_content_exists = False
     for chunk in resp:
         print(f"chunk: {chunk}")
-        if chunk.choices[0].delta.content is not None:
-            if "reasoning_content" in chunk.choices[0].delta.provider_specific_fields:
-                reasoning_content_exists = True
-                break
+        if chunk.choices[0].delta.reasoning_content is not None:
+            reasoning_content_exists = True
+            break
 
     assert reasoning_content_exists
@@ -228,3 +228,52 @@ async def test_chat_completion_client_fallbacks_with_custom_message(has_access):
     except Exception as e:
         if has_access:
             pytest.fail("Expected this to work: {}".format(str(e)))
+
+
+import asyncio
+from openai import AsyncOpenAI
+from typing import List
+import time
+
+
+async def make_request(client: AsyncOpenAI, model: str) -> bool:
+    try:
+        await client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": "Who was Alexander?"}],
+        )
+        return True
+    except Exception as e:
+        print(f"Error with {model}: {str(e)}")
+        return False
+
+
+async def run_good_model_test(client: AsyncOpenAI, num_requests: int) -> bool:
+    tasks = [make_request(client, "good-model") for _ in range(num_requests)]
+    good_results = await asyncio.gather(*tasks)
+    return all(good_results)
+
+
+@pytest.mark.asyncio
+async def test_chat_completion_bad_and_good_model():
+    """
+    Prod test - ensure even if bad model is down, good model is still working.
+    """
+    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+    num_requests = 100
+    num_iterations = 3
+
+    for iteration in range(num_iterations):
+        print(f"\nIteration {iteration + 1}/{num_iterations}")
+        start_time = time.time()
+
+        # Fire and forget bad model requests
+        for _ in range(num_requests):
+            asyncio.create_task(make_request(client, "bad-model"))
+
+        # Wait only for good model requests
+        success = await run_good_model_test(client, num_requests)
+        print(
+            f"Iteration {iteration + 1}: {'✓' if success else '✗'} ({time.time() - start_time:.2f}s)"
+        )
+        assert success, "Not all good model requests succeeded"