From c399232b2ce379dd09e644a63bc3aa2680318ade Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Thu, 13 Feb 2025 17:10:22 -0800
Subject: [PATCH] =?UTF-8?q?fix(router.py):=20add=20more=20deployment=20tim?=
 =?UTF-8?q?eout=20debug=20information=20for=20tim=E2=80=A6=20(#8523)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(router.py): add more deployment timeout debug information for timeout errors

help understand why some calls in high-traffic don't respect their model-specific timeouts

* test(test_convert_dict_to_response.py): unit test ensuring empty str is not converted to None

Addresses https://github.com/BerriAI/litellm/issues/8507

* fix(convert_dict_to_response.py): handle empty message str - don't return back as 'None'

Fixes https://github.com/BerriAI/litellm/issues/8507

* test(test_completion.py): add e2e test
---
 .../convert_dict_to_response.py               |  33 ++++--
 litellm/proxy/_new_secret_config.yaml         |  43 +------
 litellm/router.py                             |  45 ++++++--
 .../test_convert_dict_to_chat_completion.py   | 105 ++++++++++++++++++
 tests/local_testing/test_completion.py        |  68 ++++++++++++
 5 files changed, 234 insertions(+), 60 deletions(-)

diff --git a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
index def4c597f2..46d40be9c5 100644
--- a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
+++ b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
@@ -1,10 +1,10 @@
 import asyncio
 import json
+import re
 import time
 import traceback
 import uuid
-import re
-from typing import Dict, Iterable, List, Literal, Optional, Union, Tuple
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
 
 import litellm
 from litellm._logging import verbose_logger
@@ -221,17 +221,28 @@ def _handle_invalid_parallel_tool_calls(
     # if there is a JSONDecodeError, return the original tool_calls
     return tool_calls
 
-def _parse_content_for_reasoning(message_text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+
+def _parse_content_for_reasoning(
+    message_text: Optional[str],
+) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Parse the content for reasoning
+
+    Returns:
+    - reasoning_content: The content of the reasoning
+    - content: The content of the message
+    """
     if not message_text:
-        return None, None
-
+        return None, message_text
+
     reasoning_match = re.match(r"<think>(.*?)</think>(.*)", message_text, re.DOTALL)
 
     if reasoning_match:
         return reasoning_match.group(1), reasoning_match.group(2)
-
+
     return None, message_text
 
+
 class LiteLLMResponseObjectHandler:
 
     @staticmethod
@@ -445,9 +456,15 @@ def convert_to_model_response_object(  # noqa: PLR0915
                         provider_specific_fields[field] = choice["message"][field]
 
                 # Handle reasoning models that display `reasoning_content` within `content`
-                reasoning_content, content = _parse_content_for_reasoning(choice["message"].get("content", None))
+
+                reasoning_content, content = _parse_content_for_reasoning(
+                    choice["message"].get("content")
+                )
+
                 if reasoning_content:
-                    provider_specific_fields["reasoning_content"] = reasoning_content
+                    provider_specific_fields["reasoning_content"] = (
+                        reasoning_content
+                    )
 
                 message = Message(
                     content=content,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 51e88c0103..3f9a27f1c8 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,51 +1,10 @@
 model_list:
-  - model_name: gpt-3.5-turbo-testing
-    litellm_params:
-      model: gpt-3.5-turbo
-  - model_name: gpt-4
-    litellm_params:
-      model: gpt-3.5-turbo
-  - model_name: fake-openai-endpoint
-    litellm_params:
-      model: openai/fake
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
   - model_name: azure-gpt-35-turbo
     litellm_params:
       model: azure/chatgpt-v-2
       api_key: os.environ/AZURE_API_KEY
       api_base: os.environ/AZURE_API_BASE
       timeout: 0.000000001
-  - model_name: o3-mini
-    litellm_params:
-      model: o3-mini
-      rpm: 3
-  - model_name: anthropic-claude
-    litellm_params:
-      model: claude-3-5-haiku-20241022
-      timeout: 0.000000001
-  - model_name: groq/*
-    litellm_params:
-      model: groq/*
-      api_key: os.environ/GROQ_API_KEY
-      mock_response: Hi!
-  - model_name: deepseek/*
-    litellm_params:
-      model: deepseek/*
-      api_key: os.environ/DEEPSEEK_API_KEY
-  - model_name: fake-openai-endpoint
-    litellm_params:
-      model: openai/fake
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: vertex_ai/gemini-*
-    litellm_params:
-      model: vertex_ai/gemini-*
-  - model_name: fake-azure-endpoint
-    litellm_params:
-      model: openai/429
-      api_key: fake-key
-      api_base: https://exampleopenaiendpoint-production.up.railway.app
 
 litellm_settings:
-  callbacks: ["prometheus"]
\ No newline at end of file
+  request_timeout: 10000
\ No newline at end of file
diff --git a/litellm/router.py b/litellm/router.py
index bdac540f1a..758a94464e 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -573,20 +573,32 @@ class Router:
             litellm.amoderation, call_type="moderation"
         )
 
-
     def discard(self):
         """
        Pseudo-destructor to be invoked to clean up global data structures when router is no longer used.
        For now, unhook router's callbacks from all lists
        """
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_success_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.success_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm._async_failure_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.failure_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.input_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.service_callback, self)
-        litellm.logging_callback_manager.remove_callback_from_list_by_object(litellm.callbacks, self)
-
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm._async_success_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.success_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm._async_failure_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.failure_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.input_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.service_callback, self
+        )
+        litellm.logging_callback_manager.remove_callback_from_list_by_object(
+            litellm.callbacks, self
+        )
 
     def _update_redis_cache(self, cache: RedisCache):
         """
@@ -602,7 +614,6 @@ class Router:
         if self.cache.redis_cache is None:
             self.cache.redis_cache = cache
 
-
     def initialize_assistants_endpoint(self):
         ## INITIALIZE PASS THROUGH ASSISTANTS ENDPOINT ##
         self.acreate_assistants = self.factory_function(litellm.acreate_assistants)
@@ -902,6 +913,9 @@ class Router:
         - in the semaphore, make a check against it's local rpm before running
         """
         model_name = None
+        _timeout_debug_deployment_dict = (
+            {}
+        )  # this is a temporary dict to debug timeout issues
         try:
             verbose_router_logger.debug(
                 f"Inside _acompletion()- model: {model}; kwargs: {kwargs}"
             )
@@ -914,6 +928,7 @@ class Router:
                 specific_deployment=kwargs.pop("specific_deployment", None),
                 request_kwargs=kwargs,
             )
+            _timeout_debug_deployment_dict = deployment
             end_time = time.time()
             _duration = end_time - start_time
             asyncio.create_task(
@@ -1009,6 +1024,15 @@ class Router:
             )
 
             return response
+        except litellm.Timeout as e:
+            deployment_request_timeout_param = _timeout_debug_deployment_dict.get(
+                "litellm_params", {}
+            ).get("request_timeout", None)
+            deployment_timeout_param = _timeout_debug_deployment_dict.get(
+                "litellm_params", {}
+            ).get("timeout", None)
+            e.message += f"\n\nDeployment Info: request_timeout: {deployment_request_timeout_param}\ntimeout: {deployment_timeout_param}"
+            raise e
         except Exception as e:
             verbose_router_logger.info(
                 f"litellm.acompletion(model={model_name})\033[31m Exception {str(e)}\033[0m"
             )
@@ -3307,6 +3331,7 @@ class Router:
         _num_healthy_deployments = 0
         if healthy_deployments is not None and isinstance(healthy_deployments, list):
             _num_healthy_deployments = len(healthy_deployments)
+
         _num_all_deployments = 0
         if all_deployments is not None and isinstance(all_deployments, list):
             _num_all_deployments = len(all_deployments)
diff --git a/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py b/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py
index 3b0bd6ca82..e215ea147e 100644
--- a/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py
+++ b/tests/llm_translation/test_llm_response_utils/test_convert_dict_to_chat_completion.py
@@ -17,6 +17,7 @@ from litellm.types.utils import (
     Choices,
     PromptTokensDetailsWrapper,
     CompletionTokensDetailsWrapper,
+    Usage,
 )
 
 from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
@@ -750,3 +751,107 @@ def test_image_generation_openai_with_pydantic_warning(caplog):
         assert isinstance(resp.data[0], ImageObject)
     except Exception as e:
         pytest.fail(f"Test failed with exception: {e}")
+
+
+def test_convert_to_model_response_object_with_empty_str():
+    """Test that convert_to_model_response_object handles empty strings correctly."""
+
+    args = {
+        "response_object": {
+            "id": "chatcmpl-B0b1BmxhH4iSoRvFVbBJdLbMwr346",
+            "choices": [
+                {
+                    "finish_reason": "stop",
+                    "index": 0,
+                    "logprobs": None,
+                    "message": {
+                        "content": "",
+                        "refusal": None,
+                        "role": "assistant",
+                        "audio": None,
+                        "function_call": None,
+                        "tool_calls": None,
+                    },
+                }
+            ],
+            "created": 1739481997,
+            "model": "gpt-4o-mini-2024-07-18",
+            "object": "chat.completion",
+            "service_tier": "default",
+            "system_fingerprint": "fp_bd83329f63",
+            "usage": {
+                "completion_tokens": 1,
+                "prompt_tokens": 121,
+                "total_tokens": 122,
+                "completion_tokens_details": {
+                    "accepted_prediction_tokens": 0,
+                    "audio_tokens": 0,
+                    "reasoning_tokens": 0,
+                    "rejected_prediction_tokens": 0,
+                },
+                "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+            },
+        },
+        "model_response_object": ModelResponse(
+            id="chatcmpl-9f9e5ad2-d570-46fe-a5e0-4983e9774318",
+            created=1739481997,
+            model=None,
+            object="chat.completion",
+            system_fingerprint=None,
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content=None,
+                        role="assistant",
+                        tool_calls=None,
+                        function_call=None,
+                        provider_specific_fields=None,
+                    ),
+                )
+            ],
+            usage=Usage(
+                completion_tokens=0,
+                prompt_tokens=0,
+                total_tokens=0,
+                completion_tokens_details=None,
+                prompt_tokens_details=None,
+            ),
+        ),
+        "response_type": "completion",
+        "stream": False,
+        "start_time": None,
+        "end_time": None,
+        "hidden_params": None,
+        "_response_headers": {
+            "date": "Thu, 13 Feb 2025 21:26:37 GMT",
+            "content-type": "application/json",
+            "transfer-encoding": "chunked",
+            "connection": "keep-alive",
+            "access-control-expose-headers": "X-Request-ID",
+            "openai-organization": "reliablekeystest",
+            "openai-processing-ms": "297",
+            "openai-version": "2020-10-01",
+            "x-ratelimit-limit-requests": "30000",
+            "x-ratelimit-limit-tokens": "150000000",
+            "x-ratelimit-remaining-requests": "29999",
+            "x-ratelimit-remaining-tokens": "149999846",
+            "x-ratelimit-reset-requests": "2ms",
+            "x-ratelimit-reset-tokens": "0s",
+            "x-request-id": "req_651030cbda2c80353086eba8fd0a54ec",
+            "strict-transport-security": "max-age=31536000; includeSubDomains; preload",
+            "cf-cache-status": "DYNAMIC",
+            "set-cookie": "__cf_bm=0ihEMDdqKfEr0I8iP4XZ7C6xEA5rJeAc11XFXNxZgyE-1739481997-1.0.1.1-v5jbjAWhMUZ0faO8q2izQljUQC.R85Vexb18A2MCyS895bur5eRxcguP0.WGY6EkxXSaOKN55VL3Pg3NOdq_xA; path=/; expires=Thu, 13-Feb-25 21:56:37 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, _cfuvid=jrNMSOBRrxUnGgJ62BltpZZSNImfnEqPX9Uu8meGFLY-1739481997919-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
+            "x-content-type-options": "nosniff",
+            "server": "cloudflare",
+            "cf-ray": "9117e5d4caa1f7b5-LAX",
+            "content-encoding": "gzip",
+            "alt-svc": 'h3=":443"; ma=86400',
+        },
+        "convert_tool_call_to_json_mode": None,
+    }
+
+    resp: ModelResponse = convert_to_model_response_object(**args)
+    assert resp is not None
+    assert resp.choices[0].message.content is not None
diff --git a/tests/local_testing/test_completion.py b/tests/local_testing/test_completion.py
index b4ff0526a4..f367f8dc03 100644
--- a/tests/local_testing/test_completion.py
+++ b/tests/local_testing/test_completion.py
@@ -4665,3 +4665,71 @@ def test_completion_o3_mini_temperature():
         assert resp.choices[0].message.content is not None
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
+
+
+def test_completion_gpt_4o_empty_str():
+    litellm._turn_on_debug()
+    from openai import OpenAI
+    from unittest.mock import MagicMock
+
+    client = OpenAI()
+
+    # Create response object matching OpenAI's format
+    mock_response_data = {
+        "id": "chatcmpl-B0W3vmiM78Xkgx7kI7dr7PC949DMS",
+        "choices": [
+            {
+                "finish_reason": "stop",
+                "index": 0,
+                "logprobs": None,
+                "message": {
+                    "content": "",
+                    "refusal": None,
+                    "role": "assistant",
+                    "audio": None,
+                    "function_call": None,
+                    "tool_calls": None,
+                },
+            }
+        ],
+        "created": 1739462947,
+        "model": "gpt-4o-mini-2024-07-18",
+        "object": "chat.completion",
+        "service_tier": "default",
+        "system_fingerprint": "fp_bd83329f63",
+        "usage": {
+            "completion_tokens": 1,
+            "prompt_tokens": 121,
+            "total_tokens": 122,
+            "completion_tokens_details": {
+                "accepted_prediction_tokens": 0,
+                "audio_tokens": 0,
+                "reasoning_tokens": 0,
+                "rejected_prediction_tokens": 0,
+            },
+            "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+        },
+    }
+
+    # Create a mock response object
+    mock_raw_response = MagicMock()
+    mock_raw_response.headers = {
+        "x-request-id": "123",
+        "openai-organization": "org-123",
"x-ratelimit-limit-requests": "100", + "x-ratelimit-remaining-requests": "99", + } + mock_raw_response.parse.return_value = mock_response_data + + # Set up the mock completion + mock_completion = MagicMock() + mock_completion.return_value = mock_raw_response + + with patch.object( + client.chat.completions.with_raw_response, "create", mock_completion + ) as mock_create: + resp = litellm.completion( + model="gpt-4o-mini", + messages=[{"role": "user", "content": ""}], + ) + assert resp.choices[0].message.content is not None