fix(router.py): add more deployment timeout debug information for timeout errors (#8523)

* fix(router.py): add more deployment timeout debug information for timeout errors

helps understand why some calls under high traffic don't respect their model-specific timeouts (a minimal setup sketch follows the commit metadata below)

* test(test_convert_dict_to_response.py): unit test ensuring empty str is not converted to None

Addresses https://github.com/BerriAI/litellm/issues/8507

* fix(convert_dict_to_response.py): handle empty message str - don't return back as 'None'

Fixes https://github.com/BerriAI/litellm/issues/8507

* test(test_completion.py): add e2e test
Krish Dholakia 2025-02-13 17:10:22 -08:00 committed by GitHub
parent b70981b8fb
commit c399232b2c
5 changed files with 234 additions and 60 deletions
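Before the file diffs, a minimal sketch (illustrative only, not part of the commit) of the kind of setup the timeout debug information is aimed at: a deployment that carries its own model-specific timeout in litellm_params, served through a Router. The model and endpoint values below are borrowed from the test config further down; the tiny timeout would force litellm.Timeout on every call.

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "fake-openai-endpoint",
                "litellm_params": {
                    "model": "openai/fake",
                    "api_key": "fake-key",
                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
                    # deliberately tiny, so every call should raise litellm.Timeout
                    "timeout": 0.000000001,
                },
            }
        ]
    )
    # With this commit, a litellm.Timeout raised for this deployment also carries
    # the deployment's request_timeout / timeout values in the error message.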

litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py

@@ -1,10 +1,10 @@
 import asyncio
 import json
+import re
 import time
 import traceback
 import uuid
-import re
-from typing import Dict, Iterable, List, Literal, Optional, Union, Tuple
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union

 import litellm
 from litellm._logging import verbose_logger
@@ -221,17 +221,28 @@ def _handle_invalid_parallel_tool_calls(
         # if there is a JSONDecodeError, return the original tool_calls
         return tool_calls


-def _parse_content_for_reasoning(message_text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+def _parse_content_for_reasoning(
+    message_text: Optional[str],
+) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Parse the content for reasoning
+
+    Returns:
+    - reasoning_content: The content of the reasoning
+    - content: The content of the message
+    """
     if not message_text:
-        return None, None
+        return None, message_text
+
     reasoning_match = re.match(r"<think>(.*?)</think>(.*)", message_text, re.DOTALL)
+
     if reasoning_match:
         return reasoning_match.group(1), reasoning_match.group(2)
+
     return None, message_text


 class LiteLLMResponseObjectHandler:

     @staticmethod
@@ -445,9 +456,15 @@ def convert_to_model_response_object(  # noqa: PLR0915
                     provider_specific_fields[field] = choice["message"][field]

                 # Handle reasoning models that display `reasoning_content` within `content`
-                reasoning_content, content = _parse_content_for_reasoning(choice["message"].get("content", None))
+                reasoning_content, content = _parse_content_for_reasoning(
+                    choice["message"].get("content")
+                )
+
                 if reasoning_content:
-                    provider_specific_fields["reasoning_content"] = reasoning_content
+                    provider_specific_fields["reasoning_content"] = (
+                        reasoning_content
+                    )

                 message = Message(
                     content=content,
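A small illustration (not part of the commit) of how the reworked _parse_content_for_reasoning behaves, including the empty-string case from https://github.com/BerriAI/litellm/issues/8507:

    from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
        _parse_content_for_reasoning,
    )

    # Reasoning models that inline their reasoning in <think> tags get split:
    print(_parse_content_for_reasoning("<think>step by step</think>The answer is 4."))
    # -> ('step by step', 'The answer is 4.')

    # Ordinary content passes through untouched:
    print(_parse_content_for_reasoning("hello"))
    # -> (None, 'hello')

    # The fix: an empty string is returned as-is instead of collapsing to None,
    # so Message.content stays "" downstream.
    print(_parse_content_for_reasoning(""))
    # -> (None, '')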

proxy config YAML (file path not shown in the commit message)

@@ -1,51 +1,10 @@
 model_list:
-  - model_name: gpt-3.5-turbo-testing
-    litellm_params:
-      model: gpt-3.5-turbo
-  - model_name: gpt-4
-    litellm_params:
-      model: gpt-3.5-turbo
-  - model_name: fake-openai-endpoint
-    litellm_params:
-      model: openai/fake
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
   - model_name: azure-gpt-35-turbo
     litellm_params:
       model: azure/chatgpt-v-2
       api_key: os.environ/AZURE_API_KEY
       api_base: os.environ/AZURE_API_BASE
       timeout: 0.000000001
-  - model_name: o3-mini
-    litellm_params:
-      model: o3-mini
-      rpm: 3
-  - model_name: anthropic-claude
-    litellm_params:
-      model: claude-3-5-haiku-20241022
-      timeout: 0.000000001
-  - model_name: groq/*
-    litellm_params:
-      model: groq/*
-      api_key: os.environ/GROQ_API_KEY
-      mock_response: Hi!
-  - model_name: deepseek/*
-    litellm_params:
-      model: deepseek/*
-      api_key: os.environ/DEEPSEEK_API_KEY
-  - model_name: fake-openai-endpoint
-    litellm_params:
-      model: openai/fake
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: vertex_ai/gemini-*
-    litellm_params:
-      model: vertex_ai/gemini-*
-  - model_name: fake-azure-endpoint
-    litellm_params:
-      model: openai/429
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app

 litellm_settings:
-  callbacks: ["prometheus"]
+  request_timeout: 10000

litellm/router.py

@@ -573,20 +573,32 @@ class Router:
             litellm.amoderation, call_type="moderation"
         )

     def discard(self):
         """
         Pseudo-destructor to be invoked to clean up global data structures when router is no longer used.
         For now, unhook router's callbacks from all lists
         """
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_success_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.success_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_failure_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.failure_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.input_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.service_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.callbacks, self)
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm._async_success_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.success_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm._async_failure_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.failure_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.input_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.service_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.callbacks, self
+        )

     def _update_redis_cache(self, cache: RedisCache):
         """
@@ -602,7 +614,6 @@ class Router:
         if self.cache.redis_cache is None:
             self.cache.redis_cache = cache
-

     def initialize_assistants_endpoint(self):
         ## INITIALIZE PASS THROUGH ASSISTANTS ENDPOINT ##
         self.acreate_assistants = self.factory_function(litellm.acreate_assistants)
@@ -902,6 +913,9 @@ class Router:
         - in the semaphore, make a check against it's local rpm before running
         """
         model_name = None
+        _timeout_debug_deployment_dict = (
+            {}
+        )  # this is a temporary dict to debug timeout issues
         try:
             verbose_router_logger.debug(
                 f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
@@ -914,6 +928,7 @@ class Router:
                 specific_deployment=kwargs.pop("specific_deployment", None),
                 request_kwargs=kwargs,
             )
+            _timeout_debug_deployment_dict = deployment
             end_time = time.time()
             _duration = end_time - start_time
             asyncio.create_task(
@@ -1009,6 +1024,15 @@ class Router:
                 )
             return response
+        except litellm.Timeout as e:
+            deployment_request_timeout_param = _timeout_debug_deployment_dict.get(
+                "litellm_params", {}
+            ).get("request_timeout", None)
+            deployment_timeout_param = _timeout_debug_deployment_dict.get(
+                "litellm_params", {}
+            ).get("timeout", None)
+            e.message += f"\n\nDeployment Info: request_timeout: {deployment_request_timeout_param}\ntimeout: {deployment_timeout_param}"
+            raise e
         except Exception as e:
             verbose_router_logger.info(
                 f"litellm.acompletion(model={model_name})\033[31m Exception {str(e)}\033[0m"
@@ -3307,6 +3331,7 @@ class Router:
         _num_healthy_deployments = 0
         if healthy_deployments is not None and isinstance(healthy_deployments, list):
             _num_healthy_deployments = len(healthy_deployments)
+
         _num_all_deployments = 0
         if all_deployments is not None and isinstance(all_deployments, list):
             _num_all_deployments = len(all_deployments)

test_convert_dict_to_response.py

@@ -17,6 +17,7 @@ from litellm.types.utils import (
     Choices,
     PromptTokensDetailsWrapper,
     CompletionTokensDetailsWrapper,
+    Usage,
 )

 from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
@@ -750,3 +751,107 @@ def test_image_generation_openai_with_pydantic_warning(caplog):
         assert isinstance(resp.data[0], ImageObject)
     except Exception as e:
         pytest.fail(f"Test failed with exception: {e}")
+
+
+def test_convert_to_model_response_object_with_empty_str():
+    """Test that convert_to_model_response_object handles empty strings correctly."""
+
+    args = {
+        "response_object": {
+            "id": "chatcmpl-B0b1BmxhH4iSoRvFVbBJdLbMwr346",
+            "choices": [
+                {
+                    "finish_reason": "stop",
+                    "index": 0,
+                    "logprobs": None,
+                    "message": {
+                        "content": "",
+                        "refusal": None,
+                        "role": "assistant",
+                        "audio": None,
+                        "function_call": None,
+                        "tool_calls": None,
+                    },
+                }
+            ],
+            "created": 1739481997,
+            "model": "gpt-4o-mini-2024-07-18",
+            "object": "chat.completion",
+            "service_tier": "default",
+            "system_fingerprint": "fp_bd83329f63",
+            "usage": {
+                "completion_tokens": 1,
+                "prompt_tokens": 121,
+                "total_tokens": 122,
+                "completion_tokens_details": {
+                    "accepted_prediction_tokens": 0,
+                    "audio_tokens": 0,
+                    "reasoning_tokens": 0,
+                    "rejected_prediction_tokens": 0,
+                },
+                "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+            },
+        },
+        "model_response_object": ModelResponse(
+            id="chatcmpl-9f9e5ad2-d570-46fe-a5e0-4983e9774318",
+            created=1739481997,
+            model=None,
+            object="chat.completion",
+            system_fingerprint=None,
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content=None,
+                        role="assistant",
+                        tool_calls=None,
+                        function_call=None,
+                        provider_specific_fields=None,
+                    ),
+                )
+            ],
+            usage=Usage(
+                completion_tokens=0,
+                prompt_tokens=0,
+                total_tokens=0,
+                completion_tokens_details=None,
+                prompt_tokens_details=None,
+            ),
+        ),
+        "response_type": "completion",
+        "stream": False,
+        "start_time": None,
+        "end_time": None,
+        "hidden_params": None,
+        "_response_headers": {
+            "date": "Thu, 13 Feb 2025 21:26:37 GMT",
+            "content-type": "application/json",
+            "transfer-encoding": "chunked",
+            "connection": "keep-alive",
+            "access-control-expose-headers": "X-Request-ID",
+            "openai-organization": "reliablekeystest",
+            "openai-processing-ms": "297",
+            "openai-version": "2020-10-01",
+            "x-ratelimit-limit-requests": "30000",
+            "x-ratelimit-limit-tokens": "150000000",
+            "x-ratelimit-remaining-requests": "29999",
+            "x-ratelimit-remaining-tokens": "149999846",
+            "x-ratelimit-reset-requests": "2ms",
+            "x-ratelimit-reset-tokens": "0s",
+            "x-request-id": "req_651030cbda2c80353086eba8fd0a54ec",
+            "strict-transport-security": "max-age=31536000; includeSubDomains; preload",
+            "cf-cache-status": "DYNAMIC",
+            "set-cookie": "__cf_bm=0ihEMDdqKfEr0I8iP4XZ7C6xEA5rJeAc11XFXNxZgyE-1739481997-1.0.1.1-v5jbjAWhMUZ0faO8q2izQljUQC.R85Vexb18A2MCyS895bur5eRxcguP0.WGY6EkxXSaOKN55VL3Pg3NOdq_xA; path=/; expires=Thu, 13-Feb-25 21:56:37 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=jrNMSOBRrxUnGgJ62BltpZZSNImfnEqPX9Uu8meGFLY-1739481997919-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
+            "x-content-type-options": "nosniff",
+            "server": "cloudflare",
+            "cf-ray": "9117e5d4caa1f7b5-LAX",
+            "content-encoding": "gzip",
+            "alt-svc": 'h3=":443"; ma=86400',
+        },
+        "convert_tool_call_to_json_mode": None,
+    }
+
+    resp: ModelResponse = convert_to_model_response_object(**args)
+    assert resp is not None
+    assert resp.choices[0].message.content is not None

test_completion.py

@@ -4665,3 +4665,71 @@ def test_completion_o3_mini_temperature():
         assert resp.choices[0].message.content is not None
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
+
+
+def test_completion_gpt_4o_empty_str():
+    litellm._turn_on_debug()
+    from openai import OpenAI
+    from unittest.mock import MagicMock
+
+    client = OpenAI()
+
+    # Create response object matching OpenAI's format
+    mock_response_data = {
+        "id": "chatcmpl-B0W3vmiM78Xkgx7kI7dr7PC949DMS",
+        "choices": [
+            {
+                "finish_reason": "stop",
+                "index": 0,
+                "logprobs": None,
+                "message": {
+                    "content": "",
+                    "refusal": None,
+                    "role": "assistant",
+                    "audio": None,
+                    "function_call": None,
+                    "tool_calls": None,
+                },
+            }
+        ],
+        "created": 1739462947,
+        "model": "gpt-4o-mini-2024-07-18",
+        "object": "chat.completion",
+        "service_tier": "default",
+        "system_fingerprint": "fp_bd83329f63",
+        "usage": {
+            "completion_tokens": 1,
+            "prompt_tokens": 121,
+            "total_tokens": 122,
+            "completion_tokens_details": {
+                "accepted_prediction_tokens": 0,
+                "audio_tokens": 0,
+                "reasoning_tokens": 0,
+                "rejected_prediction_tokens": 0,
+            },
+            "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+        },
+    }
+
+    # Create a mock response object
+    mock_raw_response = MagicMock()
+    mock_raw_response.headers = {
+        "x-request-id": "123",
+        "openai-organization": "org-123",
+        "x-ratelimit-limit-requests": "100",
+        "x-ratelimit-remaining-requests": "99",
+    }
+    mock_raw_response.parse.return_value = mock_response_data
+
+    # Set up the mock completion
+    mock_completion = MagicMock()
+    mock_completion.return_value = mock_raw_response
+
+    with patch.object(
+        client.chat.completions.with_raw_response, "create", mock_completion
+    ) as mock_create:
+        resp = litellm.completion(
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": ""}],
+            client=client,
+        )
+        assert resp.choices[0].message.content is not None