diff --git a/litellm/constants.py b/litellm/constants.py
index fc5f3d0448..e72d5facec 100644
--- a/litellm/constants.py
+++ b/litellm/constants.py
@@ -2,11 +2,16 @@ ROUTER_MAX_FALLBACKS = 5
 DEFAULT_BATCH_SIZE = 512
 DEFAULT_FLUSH_INTERVAL_SECONDS = 5
 DEFAULT_MAX_RETRIES = 2
+DEFAULT_FAILURE_THRESHOLD_PERCENT = (
+    0.5  # default: cooldown a deployment if 50% of requests fail in a given minute
+)
+DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
 DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 LITELLM_CHAT_PROVIDERS = [
     "openai",
     "openai_like",
diff --git a/litellm/router.py b/litellm/router.py
index cb22ac6d67..1747672bbb 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -3617,66 +3617,6 @@ class Router:
 
         return request_count
 
-    def _is_cooldown_required(
-        self,
-        model_id: str,
-        exception_status: Union[str, int],
-        exception_str: Optional[str] = None,
-    ) -> bool:
-        """
-        A function to determine if a cooldown is required based on the exception status.
-
-        Parameters:
-            model_id (str) The id of the model in the model list
-            exception_status (Union[str, int]): The status of the exception.
-
-        Returns:
-            bool: True if a cooldown is required, False otherwise.
-        """
-        ## BASE CASE - single deployment
-        model_group = self.get_model_group(id=model_id)
-        if model_group is not None and len(model_group) == 1:
-            return False
-
-        try:
-            ignored_strings = ["APIConnectionError"]
-            if (
-                exception_str is not None
-            ):  # don't cooldown on litellm api connection errors errors
-                for ignored_string in ignored_strings:
-                    if ignored_string in exception_str:
-                        return False
-
-            if isinstance(exception_status, str):
-                exception_status = int(exception_status)
-
-            if exception_status >= 400 and exception_status < 500:
-                if exception_status == 429:
-                    # Cool down 429 Rate Limit Errors
-                    return True
-
-                elif exception_status == 401:
-                    # Cool down 401 Auth Errors
-                    return True
-
-                elif exception_status == 408:
-                    return True
-
-                elif exception_status == 404:
-                    return True
-
-                else:
-                    # Do NOT cool down all other 4XX Errors
-                    return False
-
-            else:
-                # should cool down for all other errors
-                return True
-
-        except Exception:
-            # Catch all - if any exceptions default to cooling down
-            return True
-
     def _has_default_fallbacks(self) -> bool:
         if self.fallbacks is None:
             return False
diff --git a/litellm/router_utils/cooldown_handlers.py b/litellm/router_utils/cooldown_handlers.py
index 1e1c58a771..8f5c3895a6 100644
--- a/litellm/router_utils/cooldown_handlers.py
+++ b/litellm/router_utils/cooldown_handlers.py
@@ -11,6 +11,11 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import litellm
 from litellm._logging import verbose_router_logger
+from litellm.constants import (
+    DEFAULT_COOLDOWN_TIME_SECONDS,
+    DEFAULT_FAILURE_THRESHOLD_PERCENT,
+    SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD,
+)
 from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
 
 from .router_callbacks.track_deployment_metrics import (
@@ -28,10 +33,62 @@ if TYPE_CHECKING:
 else:
     LitellmRouter = Any
     Span = Any
-DEFAULT_FAILURE_THRESHOLD_PERCENT = (
-    0.5  # default cooldown a deployment if 50% of requests fail in a given minute
-)
-DEFAULT_COOLDOWN_TIME_SECONDS = 5
+
+
+def _is_cooldown_required(
+    litellm_router_instance: LitellmRouter,
+    model_id: str,
+    exception_status: Union[str, int],
+    exception_str: Optional[str] = None,
+) -> bool:
+    """
+    A function to determine if a cooldown is required based on the exception status.
+
+    Parameters:
+        model_id (str): The id of the model in the model list
+        exception_status (Union[str, int]): The status of the exception.
+
+    Returns:
+        bool: True if a cooldown is required, False otherwise.
+    """
+    try:
+        ignored_strings = ["APIConnectionError"]
+        if (
+            exception_str is not None
+        ):  # don't cooldown on litellm api connection errors
+            for ignored_string in ignored_strings:
+                if ignored_string in exception_str:
+                    return False
+
+        if isinstance(exception_status, str):
+            exception_status = int(exception_status)
+
+        if exception_status >= 400 and exception_status < 500:
+            if exception_status == 429:
+                # Cool down 429 Rate Limit Errors
+                return True
+
+            elif exception_status == 401:
+                # Cool down 401 Auth Errors
+                return True
+
+            elif exception_status == 408:
+                return True
+
+            elif exception_status == 404:
+                return True
+
+            else:
+                # Do NOT cool down all other 4XX Errors
+                return False
+
+        else:
+            # should cool down for all other errors
+            return True
+
+    except Exception:
+        # Catch all - if any exceptions default to cooling down
+        return True
 
 
 def _should_run_cooldown_logic(
@@ -51,13 +108,20 @@
     - deployment is in litellm_router_instance.provider_default_deployment_ids
     - exception_status is not one that should be immediately retried (e.g. 401)
     """
+    if (
+        deployment is None
+        or litellm_router_instance.get_model_group(id=deployment) is None
+    ):
+        return False
+
     if litellm_router_instance.disable_cooldowns:
         return False
 
     if deployment is None:
         return False
 
-    if not litellm_router_instance._is_cooldown_required(
+    if not _is_cooldown_required(
+        litellm_router_instance=litellm_router_instance,
         model_id=deployment,
         exception_status=exception_status,
         exception_str=str(original_exception),
@@ -94,6 +158,11 @@
 
     - v1 logic (Legacy): if allowed fails or allowed fail policy set, coolsdown if num fails in this minute > allowed fails
     """
+    ## BASE CASE - single deployment
+    model_group = litellm_router_instance.get_model_group(id=deployment)
+    is_single_deployment_model_group = False
+    if model_group is not None and len(model_group) == 1:
+        is_single_deployment_model_group = True
     if (
         litellm_router_instance.allowed_fails_policy is None
         and _is_allowed_fails_set_on_router(
@@ -121,14 +190,21 @@
             num_successes_this_minute,
             num_fails_this_minute,
         )
+
         exception_status_int = cast_exception_status_to_int(exception_status)
-        if exception_status_int == 429:
+        if exception_status_int == 429 and not is_single_deployment_model_group:
             return True
         elif (
-            total_requests_this_minute == 1
-        ):  # if the 1st request fails it's not guaranteed that the deployment should be cooled down
-            return False
-        elif percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
+            percent_fails == 1.0
+            and total_requests_this_minute
+            >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+        ):
+            # Cooldown if all requests failed and we have reasonable traffic
+            return True
+        elif (
+            percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT
+            and not is_single_deployment_model_group  # by default we should avoid cooldowns on single deployment model groups
+        ):
             return True
 
         elif (
@@ -140,7 +216,7 @@
             return True
 
         return False
-    else:
+    elif not is_single_deployment_model_group:
         return should_cooldown_based_on_allowed_fails_policy(
             litellm_router_instance=litellm_router_instance,
             deployment=deployment,
diff --git a/tests/local_testing/test_router.py b/tests/local_testing/test_router.py
index d2c1c8fbec..965b47f572 100644
--- a/tests/local_testing/test_router.py
+++ b/tests/local_testing/test_router.py
@@ -2190,6 +2190,8 @@ def test_router_context_window_pre_call_check(model, base_model, llm_provider):
 
 
 def test_router_cooldown_api_connection_error():
+    from litellm.router_utils.cooldown_handlers import _is_cooldown_required
+
     try:
         _ = litellm.completion(
             model="vertex_ai/gemini-1.5-pro",
@@ -2197,8 +2199,11 @@ def test_router_cooldown_api_connection_error():
         )
     except litellm.APIConnectionError as e:
         assert (
-            Router()._is_cooldown_required(
-                model_id="", exception_status=e.code, exception_str=str(e)
+            _is_cooldown_required(
+                litellm_router_instance=Router(),
+                model_id="",
+                exception_status=e.code,
+                exception_str=str(e),
             )
             is False
         )
diff --git a/tests/router_unit_tests/test_router_cooldown_utils.py b/tests/router_unit_tests/test_router_cooldown_utils.py
index 1623808fc1..4ac703419a 100644
--- a/tests/router_unit_tests/test_router_cooldown_utils.py
+++ b/tests/router_unit_tests/test_router_cooldown_utils.py
@@ -25,6 +25,11 @@ from litellm.router_utils.router_callbacks.track_deployment_metrics import (
     increment_deployment_successes_for_current_minute,
 )
 
+import pytest
+from unittest.mock import patch
+from litellm import Router
+from litellm.router_utils.cooldown_handlers import _should_cooldown_deployment
+
 load_dotenv()
 
 
@@ -183,6 +188,11 @@ def testing_litellm_router():
                "litellm_params": {"model": "openai/test_deployment"},
                "model_id": "test_deployment_2",
            },
+            {
+                "model_name": "test_deployment",
+                "litellm_params": {"model": "openai/test_deployment-2"},
+                "model_id": "test_deployment_3",
+            },
         ]
     )
 
@@ -395,3 +405,114 @@ def test_cast_exception_status_to_int():
     assert cast_exception_status_to_int(200) == 200
     assert cast_exception_status_to_int("404") == 404
     assert cast_exception_status_to_int("invalid") == 500
+
+
+@pytest.fixture
+def router():
+    return Router(
+        model_list=[
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {"model": "gpt-4"},
+                "model_info": {
+                    "id": "gpt-4--0",
+                },
+            }
+        ]
+    )
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_should_cooldown_high_traffic_all_fails(mock_failures, mock_successes, router):
+    # Simulate traffic above SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD, all failures, 0 successes
+    from litellm.constants import SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+
+    mock_failures.return_value = SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD + 1
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is True
+    ), "Should cooldown when all requests fail with sufficient traffic"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_no_cooldown_low_traffic(mock_failures, mock_successes, router):
+    # Simulate 3 failures, 0 successes (below SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD)
+    mock_failures.return_value = 3
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown when traffic is below threshold"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_cooldown_rate_limit(mock_failures, mock_successes, router):
+    """
+    Don't cooldown single-deployment model groups for anything besides sustained failing traffic (e.g. a lone 429 should not trigger a cooldown)
+    """
+    mock_failures.return_value = 1
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=429,  # Rate limit error
+        original_exception=Exception("Rate limit exceeded"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown on rate limit error for single deployment models"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_mixed_success_failure(mock_failures, mock_successes, router):
+    # Simulate 3 failures, 7 successes
+    mock_failures.return_value = 3
+    mock_successes.return_value = 7
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown when failure rate is below threshold"
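
Not part of the patch above: a minimal sketch of the single-deployment cooldown rule this diff introduces, assuming the constants added in litellm/constants.py and the default allowed-fails settings. The helper name below is hypothetical and only illustrates the threshold math applied inside _should_cooldown_deployment; it is not an API added by the patch.

from litellm.constants import SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD


def single_deployment_should_cooldown(
    fails_this_minute: int, successes_this_minute: int
) -> bool:
    # Hypothetical helper (illustration only). With this patch, a model group that has
    # exactly one deployment is cooled down only when every request in the current minute
    # failed AND traffic reached SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD (1000/minute).
    # A 429 or a >50% failure rate alone no longer cools down a single-deployment group.
    total = fails_this_minute + successes_this_minute
    if total == 0:
        return False
    return (
        fails_this_minute == total
        and total >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
    )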