From c399232b2ce379dd09e644a63bc3aa2680318ade Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Thu, 13 Feb 2025 17:10:22 -0800
Subject: [PATCH] =?UTF-8?q?fix(router.py):=20add=20more=20deployment=20tim?=
 =?UTF-8?q?eout=20debug=20information=20for=20tim=E2=80=A6=20(#8523)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(router.py): add more deployment timeout debug information for timeout errors

help understand why some calls in high-traffic don't respect their model-specific timeouts

* test(test_convert_dict_to_response.py): unit test ensuring empty str is not converted to None

Addresses https://github.com/BerriAI/litellm/issues/8507

* fix(convert_dict_to_response.py): handle empty message str - don't return back as 'None'

Fixes https://github.com/BerriAI/litellm/issues/8507

* test(test_completion.py): add e2e test
---
 .../convert_dict_to_response.py               |  33 ++++--
 litellm/proxy/_new_secret_config.yaml         |  43 +------
 litellm/router.py                             |  45 ++++++--
 .../test_convert_dict_to_chat_completion.py   | 105 ++++++++++++++++++
 tests/local_testing/test_completion.py        |  68 ++++++++++++
 5 files changed, 234 insertions(+), 60 deletions(-)

diff --git a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
index def4c597f2..46d40be9c5 100644
--- a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
+++ b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
@@ -1,10 +1,10 @@
 import asyncio
 import json
+import re
 import time
 import traceback
 import uuid
-import re
-from typing import Dict, Iterable, List, Literal, Optional, Union, Tuple
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
 
 import litellm
 from litellm._logging import verbose_logger
@@ -221,17 +221,28 @@ def _handle_invalid_parallel_tool_calls(
     # if there is a JSONDecodeError, return the original tool_calls
     return tool_calls
 
-def _parse_content_for_reasoning(message_text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+
+def _parse_content_for_reasoning(
+    message_text: Optional[str],
+) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Parse the content for reasoning
+
+    Returns:
+    - reasoning_content: The content of the reasoning
+    - content: The content of the message
+    """
     if not message_text:
-        return None, None
-
+        return None, message_text
+
     reasoning_match = re.match(r"<think>(.*?)</think>(.*)", message_text, re.DOTALL)
 
     if reasoning_match:
         return reasoning_match.group(1), reasoning_match.group(2)
-
+
     return None, message_text
 
+
 class LiteLLMResponseObjectHandler:
 
     @staticmethod
@@ -445,9 +456,15 @@ def convert_to_model_response_object(  # noqa: PLR0915
                         provider_specific_fields[field] = choice["message"][field]
 
                 # Handle reasoning models that display `reasoning_content` within `content`
-                reasoning_content, content = _parse_content_for_reasoning(choice["message"].get("content", None))
+
+                reasoning_content, content = _parse_content_for_reasoning(
+                    choice["message"].get("content")
+                )
+
                 if reasoning_content:
-                    provider_specific_fields["reasoning_content"] = reasoning_content
+                    provider_specific_fields["reasoning_content"] = (
+                        reasoning_content
+                    )
 
                 message = Message(
                     content=content,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 51e88c0103..3f9a27f1c8 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,51 +1,10 @@
 model_list:
-  - model_name: gpt-3.5-turbo-testing
-    litellm_params:
-      model: gpt-3.5-turbo
-  - model_name: gpt-4
-    litellm_params:
-      model: gpt-3.5-turbo
-  - model_name: fake-openai-endpoint
-    litellm_params:
-      model: openai/fake
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
   - model_name: azure-gpt-35-turbo
     litellm_params:
       model: azure/chatgpt-v-2
       api_key: os.environ/AZURE_API_KEY
       api_base: os.environ/AZURE_API_BASE
       timeout: 0.000000001
-  - model_name: o3-mini
-    litellm_params:
-      model: o3-mini
-      rpm: 3
-  - model_name: anthropic-claude
-    litellm_params:
-      model: claude-3-5-haiku-20241022
-      timeout: 0.000000001
-  - model_name: groq/*
-    litellm_params:
-      model: groq/*
-      api_key: os.environ/GROQ_API_KEY
-      mock_response: Hi!
-  - model_name: deepseek/*
-    litellm_params:
-      model: deepseek/*
-      api_key: os.environ/DEEPSEEK_API_KEY
-  - model_name: fake-openai-endpoint
-    litellm_params:
-      model: openai/fake
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: vertex_ai/gemini-*
-    litellm_params:
-      model: vertex_ai/gemini-*
-  - model_name: fake-azure-endpoint
-    litellm_params:
-      model: openai/429
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app
 
 litellm_settings:
-  callbacks: ["prometheus"]
\ No newline at end of file
+  request_timeout: 10000
\ No newline at end of file
diff --git a/litellm/router.py b/litellm/router.py
index bdac540f1a..758a94464e 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -573,20 +573,32 @@ class Router:
             litellm.amoderation, call_type="moderation"
         )
 
-
     def discard(self):
         """
        Pseudo-destructor to be invoked to clean up global data structures when router is no longer used.
        For now, unhook router's callbacks from all lists
        """
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_success_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.success_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_failure_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.failure_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.input_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.service_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.callbacks, self)
-
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm._async_success_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.success_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm._async_failure_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.failure_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.input_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.service_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.callbacks, self
+        )
 
     def _update_redis_cache(self, cache: RedisCache):
         """
@@ -602,7 +614,6 @@ class Router:
         if self.cache.redis_cache is None:
             self.cache.redis_cache = cache
 
-
     def initialize_assistants_endpoint(self):
         ## INITIALIZE PASS THROUGH ASSISTANTS ENDPOINT ##
         self.acreate_assistants = self.factory_function(litellm.acreate_assistants)
@@ -902,6 +913,9 @@ class Router:
         - in the semaphore, make a check against it's local rpm before running
         """
         model_name = None
+        _timeout_debug_deployment_dict = (
+            {}
+        )  # this is a temporary dict to debug timeout issues
         try:
             verbose_router_logger.debug(
                 f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
             )
@@ -914,6 +928,7 @@ class Router:
                 specific_deployment=kwargs.pop("specific_deployment", None),
                 request_kwargs=kwargs,
             )
+            _timeout_debug_deployment_dict = deployment
             end_time = time.time()
             _duration = end_time - start_time
             asyncio.create_task(
@@ -1009,6 +1024,15 @@ class Router:
             )
 
             return response
+        except litellm.Timeout as e:
+            deployment_request_timeout_param = _timeout_debug_deployment_dict.get(
+                "litellm_params", {}
+            ).get("request_timeout", None)
+            deployment_timeout_param = _timeout_debug_deployment_dict.get(
+                "litellm_params", {}
+            ).get("timeout", None)
+            e.message += f"\n\nDeployment Info: request_timeout: {deployment_request_timeout_param}\ntimeout: {deployment_timeout_param}"
+            raise e
         except Exception as e:
             verbose_router_logger.info(
                 f"litellm.acompletion(model={model_name})\033[31m Exception {str(e)}\033[0m"
             )
@@ -3307,6 +3331,7 @@ class Router:
         _num_healthy_deployments = 0
         if healthy_deployments is not None and isinstance(healthy_deployments, list):
             _num_healthy_deployments = len(healthy_deployments)
+
         _num_all_deployments = 0
         if all_deployments is not None and isinstance(all_deployments, list):
             _num_all_deployments = len(all_deployments)
diff --git a/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py b/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py
index 3b0bd6ca82..e215ea147e 100644
--- a/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py
+++ b/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py
@@ -17,6 +17,7 @@ from litellm.types.utils import (
     Choices,
     PromptTokensDetailsWrapper,
     CompletionTokensDetailsWrapper,
+    Usage,
 )
 
 from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
@@ -750,3 +751,107 @@ def test_image_generation_openai_with_pydantic_warning(caplog):
         assert isinstance(resp.data[0], ImageObject)
     except Exception as e:
         pytest.fail(f"Test failed with exception: {e}")
+
+
+def test_convert_to_model_response_object_with_empty_str():
+    """Test that convert_to_model_response_object handles empty strings correctly."""
+
+    args = {
+        "response_object": {
+            "id": "chatcmpl-B0b1BmxhH4iSoRvFVbBJdLbMwr346",
+            "choices": [
+                {
+                    "finish_reason": "stop",
+                    "index": 0,
+                    "logprobs": None,
+                    "message": {
+                        "content": "",
+                        "refusal": None,
+                        "role": "assistant",
+                        "audio": None,
+                        "function_call": None,
+                        "tool_calls": None,
+                    },
+                }
+            ],
+            "created": 1739481997,
+            "model": "gpt-4o-mini-2024-07-18",
+            "object": "chat.completion",
+            "service_tier": "default",
+            "system_fingerprint": "fp_bd83329f63",
+            "usage": {
+                "completion_tokens": 1,
+                "prompt_tokens": 121,
+                "total_tokens": 122,
+                "completion_tokens_details": {
+                    "accepted_prediction_tokens": 0,
+                    "audio_tokens": 0,
+                    "reasoning_tokens": 0,
+                    "rejected_prediction_tokens": 0,
+                },
+                "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+            },
+        },
+        "model_response_object": ModelResponse(
+            id="chatcmpl-9f9e5ad2-d570-46fe-a5e0-4983e9774318",
+            created=1739481997,
+            model=None,
+            object="chat.completion",
+            system_fingerprint=None,
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content=None,
+                        role="assistant",
+                        tool_calls=None,
+                        function_call=None,
+                        provider_specific_fields=None,
+                    ),
+                )
+            ],
+            usage=Usage(
+                completion_tokens=0,
+                prompt_tokens=0,
+                total_tokens=0,
+                completion_tokens_details=None,
+                prompt_tokens_details=None,
+            ),
+        ),
+        "response_type": "completion",
+        "stream": False,
+        "start_time": None,
+        "end_time": None,
+        "hidden_params": None,
+        "_response_headers": {
+            "date": "Thu, 13 Feb 2025 21:26:37 GMT",
+            "content-type": "application/json",
+            "transfer-encoding": "chunked",
+            "connection": "keep-alive",
+            "access-control-expose-headers": "X-Request-ID",
+            "openai-organization": "reliablekeystest",
+            "openai-processing-ms": "297",
+            "openai-version": "2020-10-01",
+            "x-ratelimit-limit-requests": "30000",
+            "x-ratelimit-limit-tokens": "150000000",
+            "x-ratelimit-remaining-requests": "29999",
+            "x-ratelimit-remaining-tokens": "149999846",
+            "x-ratelimit-reset-requests": "2ms",
+            "x-ratelimit-reset-tokens": "0s",
+            "x-request-id": "req_651030cbda2c80353086eba8fd0a54ec",
+            "strict-transport-security": "max-age=31536000; includeSubDomains; preload",
+            "cf-cache-status": "DYNAMIC",
+            "set-cookie": "__cf_bm=0ihEMDdqKfEr0I8iP4XZ7C6xEA5rJeAc11XFXNxZgyE-1739481997-1.0.1.1-v5jbjAWhMUZ0faO8q2izQljUQC.R85Vexb18A2MCyS895bur5eRxcguP0.WGY6EkxXSaOKN55VL3Pg3NOdq_xA; path=/; expires=Thu, 13-Feb-25 21:56:37 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=jrNMSOBRrxUnGgJ62BltpZZSNImfnEqPX9Uu8meGFLY-1739481997919-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
+            "x-content-type-options": "nosniff",
+            "server": "cloudflare",
+            "cf-ray": "9117e5d4caa1f7b5-LAX",
+            "content-encoding": "gzip",
+            "alt-svc": 'h3=":443"; ma=86400',
+        },
+        "convert_tool_call_to_json_mode": None,
+    }
+
+    resp: ModelResponse = convert_to_model_response_object(**args)
+    assert resp is not None
+    assert resp.choices[0].message.content is not None
diff --git a/tests/local_testing/test_completion.py b/tests/local_testing/test_completion.py
index b4ff0526a4..f367f8dc03 100644
--- a/tests/local_testing/test_completion.py
+++ b/tests/local_testing/test_completion.py
@@ -4665,3 +4665,71 @@ def test_completion_o3_mini_temperature():
         assert resp.choices[0].message.content is not None
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
+
+
+def test_completion_gpt_4o_empty_str():
+    litellm._turn_on_debug()
+    from openai import OpenAI
+    from unittest.mock import MagicMock
+
+    client = OpenAI()
+
+    # Create response object matching OpenAI's format
+    mock_response_data = {
+        "id": "chatcmpl-B0W3vmiM78Xkgx7kI7dr7PC949DMS",
+        "choices": [
+            {
+                "finish_reason": "stop",
+                "index": 0,
+                "logprobs": None,
+                "message": {
+                    "content": "",
+                    "refusal": None,
+                    "role": "assistant",
+                    "audio": None,
+                    "function_call": None,
+                    "tool_calls": None,
+                },
+            }
+        ],
+        "created": 1739462947,
+        "model": "gpt-4o-mini-2024-07-18",
+        "object": "chat.completion",
+        "service_tier": "default",
+        "system_fingerprint": "fp_bd83329f63",
+        "usage": {
+            "completion_tokens": 1,
+            "prompt_tokens": 121,
+            "total_tokens": 122,
+            "completion_tokens_details": {
+                "accepted_prediction_tokens": 0,
+                "audio_tokens": 0,
+                "reasoning_tokens": 0,
+                "rejected_prediction_tokens": 0,
+            },
+            "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+        },
+    }
+
+    # Create a mock response object
+    mock_raw_response = MagicMock()
+    mock_raw_response.headers = {
+        "x-request-id": "123",
+        "openai-organization": "org-123",
"x-ratelimit-limit-requests": "100", + "x-ratelimit-remaining-requests": "99", + } + mock_raw_response.parse.return_value = mock_response_data + + # Set up the mock completion + mock_completion = MagicMock() + mock_completion.return_value = mock_raw_response + + with patch.object( + client.chat.completions.with_raw_response, "create", mock_completion + ) as mock_create: + resp = litellm.completion( + model="gpt-4o-mini", + messages=[{"role": "user", "content": ""}], + ) + assert resp.choices[0].message.content is not None