fix(router.py): add more deployment timeout debug information for timeout errors (#8523)

* fix(router.py): add more deployment timeout debug information for timeout errors

helps understand why some calls under high traffic don't respect their model-specific timeouts

* test(test_convert_dict_to_response.py): unit test ensuring empty str is not converted to None

Addresses https://github.com/BerriAI/litellm/issues/8507

* fix(convert_dict_to_response.py): handle empty message str - don't return back as 'None'

Fixes https://github.com/BerriAI/litellm/issues/8507

* test(test_completion.py): add e2e test
Krish Dholakia 2025-02-13 17:10:22 -08:00 committed by GitHub
parent b70981b8fb
commit c399232b2c
5 changed files with 234 additions and 60 deletions

View file

@@ -1,10 +1,10 @@
import asyncio
import json
import re
import time
import traceback
import uuid
import re
from typing import Dict, Iterable, List, Literal, Optional, Union, Tuple
from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
import litellm
from litellm._logging import verbose_logger
@@ -221,9 +221,19 @@ def _handle_invalid_parallel_tool_calls(
# if there is a JSONDecodeError, return the original tool_calls
return tool_calls
def _parse_content_for_reasoning(message_text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
def _parse_content_for_reasoning(
message_text: Optional[str],
) -> Tuple[Optional[str], Optional[str]]:
"""
Parse the content for reasoning
Returns:
- reasoning_content: The content of the reasoning
- content: The content of the message
"""
if not message_text:
return None, None
return None, message_text
reasoning_match = re.match(r"<think>(.*?)</think>(.*)", message_text, re.DOTALL)
@@ -232,6 +242,7 @@ def _parse_content_for_reasoning(message_text: Optional[str]) -> Tuple[Optional[
return None, message_text
class LiteLLMResponseObjectHandler:
@staticmethod
@@ -445,9 +456,15 @@ def convert_to_model_response_object( # noqa: PLR0915
provider_specific_fields[field] = choice["message"][field]
# Handle reasoning models that display `reasoning_content` within `content`
reasoning_content, content = _parse_content_for_reasoning(choice["message"].get("content", None))
reasoning_content, content = _parse_content_for_reasoning(
choice["message"].get("content")
)
if reasoning_content:
provider_specific_fields["reasoning_content"] = reasoning_content
provider_specific_fields["reasoning_content"] = (
reasoning_content
)
message = Message(
content=content,
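
A quick usage sketch of the updated helper (behavior inferred from the hunk above; the <think>-tag branch is truncated in the diff, so its exact return value is an assumption):

from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
    _parse_content_for_reasoning,
)

# Reasoning models that wrap their chain of thought in <think> tags get split into
# (reasoning_content, content) by the regex shown above.
reasoning, content = _parse_content_for_reasoning("<think>add 2 and 2</think>The answer is 4.")
# assumed result: reasoning == "add 2 and 2", content == "The answer is 4."

# The change for issue #8507: an empty string is now returned as content instead of
# being collapsed to None (a None input still yields (None, None)).
reasoning, content = _parse_content_for_reasoning("")
assert reasoning is None
assert content == ""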

View file

@@ -1,51 +1,10 @@
model_list:
- model_name: gpt-3.5-turbo-testing
litellm_params:
model: gpt-3.5-turbo
- model_name: gpt-4
litellm_params:
model: gpt-3.5-turbo
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: azure-gpt-35-turbo
litellm_params:
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
timeout: 0.000000001
- model_name: o3-mini
litellm_params:
model: o3-mini
rpm: 3
- model_name: anthropic-claude
litellm_params:
model: claude-3-5-haiku-20241022
timeout: 0.000000001
- model_name: groq/*
litellm_params:
model: groq/*
api_key: os.environ/GROQ_API_KEY
mock_response: Hi!
- model_name: deepseek/*
litellm_params:
model: deepseek/*
api_key: os.environ/DEEPSEEK_API_KEY
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: vertex_ai/gemini-*
litellm_params:
model: vertex_ai/gemini-*
- model_name: fake-azure-endpoint
litellm_params:
model: openai/429
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app
litellm_settings:
callbacks: ["prometheus"]
request_timeout: 10000

View file

@@ -573,20 +573,32 @@ class Router:
litellm.amoderation, call_type="moderation"
)
def discard(self):
"""
Pseudo-destructor to be invoked to clean up global data structures when router is no longer used.
For now, unhook router's callbacks from all lists
"""
litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_success_callback, self)
litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.success_callback, self)
litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_failure_callback, self)
litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.failure_callback, self)
litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.input_callback, self)
litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.service_callback, self)
litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.callbacks, self)
litellm.logging_callback_manager.remove_callback_from_list_by_object(
litellm._async_success_callback, self
)
litellm.logging_callback_manager.remove_callback_from_list_by_object(
litellm.success_callback, self
)
litellm.logging_callback_manager.remove_callback_from_list_by_object(
litellm._async_failure_callback, self
)
litellm.logging_callback_manager.remove_callback_from_list_by_object(
litellm.failure_callback, self
)
litellm.logging_callback_manager.remove_callback_from_list_by_object(
litellm.input_callback, self
)
litellm.logging_callback_manager.remove_callback_from_list_by_object(
litellm.service_callback, self
)
litellm.logging_callback_manager.remove_callback_from_list_by_object(
litellm.callbacks, self
)
def _update_redis_cache(self, cache: RedisCache):
"""
@@ -602,7 +614,6 @@ class Router:
if self.cache.redis_cache is None:
self.cache.redis_cache = cache
def initialize_assistants_endpoint(self):
## INITIALIZE PASS THROUGH ASSISTANTS ENDPOINT ##
self.acreate_assistants = self.factory_function(litellm.acreate_assistants)
@@ -902,6 +913,9 @@ class Router:
- in the semaphore, make a check against its local rpm before running
"""
model_name = None
_timeout_debug_deployment_dict = (
{}
) # this is a temporary dict to debug timeout issues
try:
verbose_router_logger.debug(
f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
@@ -914,6 +928,7 @@
specific_deployment=kwargs.pop("specific_deployment", None),
request_kwargs=kwargs,
)
_timeout_debug_deployment_dict = deployment
end_time = time.time()
_duration = end_time - start_time
asyncio.create_task(
@@ -1009,6 +1024,15 @@
)
return response
except litellm.Timeout as e:
deployment_request_timeout_param = _timeout_debug_deployment_dict.get(
"litellm_params", {}
).get("request_timeout", None)
deployment_timeout_param = _timeout_debug_deployment_dict.get(
"litellm_params", {}
).get("timeout", None)
e.message += f"\n\nDeployment Info: request_timeout: {deployment_request_timeout_param}\ntimeout: {deployment_timeout_param}"
raise e
except Exception as e:
verbose_router_logger.info(
f"litellm.acompletion(model={model_name})\033[31m Exception {str(e)}\033[0m"
@@ -3307,6 +3331,7 @@ class Router:
_num_healthy_deployments = 0
if healthy_deployments is not None and isinstance(healthy_deployments, list):
_num_healthy_deployments = len(healthy_deployments)
_num_all_deployments = 0
if all_deployments is not None and isinstance(all_deployments, list):
_num_all_deployments = len(all_deployments)
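
A rough sketch of how the new Timeout debug info added in _acompletion above is expected to surface. The model name and params are assumptions that mirror the per-deployment timeout: 0.000000001 entries in the config diff; retries, fallbacks, and credentials (ANTHROPIC_API_KEY) are glossed over here.

import asyncio
import litellm
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "anthropic-claude",
            "litellm_params": {
                "model": "claude-3-5-haiku-20241022",
                "timeout": 0.000000001,  # deployment-specific timeout, small enough to always trip
            },
        }
    ]
)

async def main():
    try:
        await router.acompletion(
            model="anthropic-claude",
            messages=[{"role": "user", "content": "hi"}],
        )
    except litellm.Timeout as e:
        # With this change the exception message should now end with something like:
        #   Deployment Info: request_timeout: None
        #   timeout: 1e-09
        print(e)

asyncio.run(main())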

View file

@@ -17,6 +17,7 @@ from litellm.types.utils import (
Choices,
PromptTokensDetailsWrapper,
CompletionTokensDetailsWrapper,
Usage,
)
from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
@@ -750,3 +751,107 @@ def test_image_generation_openai_with_pydantic_warning(caplog):
assert isinstance(resp.data[0], ImageObject)
except Exception as e:
pytest.fail(f"Test failed with exception: {e}")
def test_convert_to_model_response_object_with_empty_str():
"""Test that convert_to_model_response_object handles empty strings correctly."""
args = {
"response_object": {
"id": "chatcmpl-B0b1BmxhH4iSoRvFVbBJdLbMwr346",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": None,
"message": {
"content": "",
"refusal": None,
"role": "assistant",
"audio": None,
"function_call": None,
"tool_calls": None,
},
}
],
"created": 1739481997,
"model": "gpt-4o-mini-2024-07-18",
"object": "chat.completion",
"service_tier": "default",
"system_fingerprint": "fp_bd83329f63",
"usage": {
"completion_tokens": 1,
"prompt_tokens": 121,
"total_tokens": 122,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0,
},
"prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
},
},
"model_response_object": ModelResponse(
id="chatcmpl-9f9e5ad2-d570-46fe-a5e0-4983e9774318",
created=1739481997,
model=None,
object="chat.completion",
system_fingerprint=None,
choices=[
Choices(
finish_reason="stop",
index=0,
message=Message(
content=None,
role="assistant",
tool_calls=None,
function_call=None,
provider_specific_fields=None,
),
)
],
usage=Usage(
completion_tokens=0,
prompt_tokens=0,
total_tokens=0,
completion_tokens_details=None,
prompt_tokens_details=None,
),
),
"response_type": "completion",
"stream": False,
"start_time": None,
"end_time": None,
"hidden_params": None,
"_response_headers": {
"date": "Thu, 13 Feb 2025 21:26:37 GMT",
"content-type": "application/json",
"transfer-encoding": "chunked",
"connection": "keep-alive",
"access-control-expose-headers": "X-Request-ID",
"openai-organization": "reliablekeystest",
"openai-processing-ms": "297",
"openai-version": "2020-10-01",
"x-ratelimit-limit-requests": "30000",
"x-ratelimit-limit-tokens": "150000000",
"x-ratelimit-remaining-requests": "29999",
"x-ratelimit-remaining-tokens": "149999846",
"x-ratelimit-reset-requests": "2ms",
"x-ratelimit-reset-tokens": "0s",
"x-request-id": "req_651030cbda2c80353086eba8fd0a54ec",
"strict-transport-security": "max-age=31536000; includeSubDomains; preload",
"cf-cache-status": "DYNAMIC",
"set-cookie": "__cf_bm=0ihEMDdqKfEr0I8iP4XZ7C6xEA5rJeAc11XFXNxZgyE-1739481997-1.0.1.1-v5jbjAWhMUZ0faO8q2izQljUQC.R85Vexb18A2MCyS895bur5eRxcguP0.WGY6EkxXSaOKN55VL3Pg3NOdq_xA; path=/; expires=Thu, 13-Feb-25 21:56:37 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=jrNMSOBRrxUnGgJ62BltpZZSNImfnEqPX9Uu8meGFLY-1739481997919-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
"x-content-type-options": "nosniff",
"server": "cloudflare",
"cf-ray": "9117e5d4caa1f7b5-LAX",
"content-encoding": "gzip",
"alt-svc": 'h3=":443"; ma=86400',
},
"convert_tool_call_to_json_mode": None,
}
resp: ModelResponse = convert_to_model_response_object(**args)
assert resp is not None
assert resp.choices[0].message.content is not None

View file

@@ -4665,3 +4665,71 @@ def test_completion_o3_mini_temperature():
assert resp.choices[0].message.content is not None
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_completion_gpt_4o_empty_str():
litellm._turn_on_debug()
from openai import OpenAI
from unittest.mock import MagicMock
client = OpenAI()
# Create response object matching OpenAI's format
mock_response_data = {
"id": "chatcmpl-B0W3vmiM78Xkgx7kI7dr7PC949DMS",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": None,
"message": {
"content": "",
"refusal": None,
"role": "assistant",
"audio": None,
"function_call": None,
"tool_calls": None,
},
}
],
"created": 1739462947,
"model": "gpt-4o-mini-2024-07-18",
"object": "chat.completion",
"service_tier": "default",
"system_fingerprint": "fp_bd83329f63",
"usage": {
"completion_tokens": 1,
"prompt_tokens": 121,
"total_tokens": 122,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0,
},
"prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
},
}
# Create a mock response object
mock_raw_response = MagicMock()
mock_raw_response.headers = {
"x-request-id": "123",
"openai-organization": "org-123",
"x-ratelimit-limit-requests": "100",
"x-ratelimit-remaining-requests": "99",
}
mock_raw_response.parse.return_value = mock_response_data
# Set up the mock completion
mock_completion = MagicMock()
mock_completion.return_value = mock_raw_response
with patch.object(
client.chat.completions.with_raw_response, "create", mock_completion
) as mock_create:
resp = litellm.completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": ""}],
)
assert resp.choices[0].message.content is not None