Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)
Improve Proxy Resiliency: Cooldown single-deployment model groups if 100% calls failed in high traffic (#7823)
* refactor(_is_cooldown_required): move '_is_cooldown_required' into cooldown_handlers.py
* refactor(cooldown_handlers.py): move cooldown constants into `constants.py`
* fix(cooldown_handlers.py): remove the "never cooldown a single deployment" logic and replace it with traffic-based cooldown logic. Addresses https://github.com/BerriAI/litellm/issues/7822
* fix: add unit tests for '_should_cooldown_deployment'
* test: ensure all tests pass
* test: update test
* fix(cooldown_handlers.py): don't cooldown single-deployment models for anything besides traffic-related errors
* fix(cooldown_handlers.py): fix cooldown handler logic
* fix(cooldown_handlers.py): fix check
Parent: d00febcdaa
Commit: 80f7af510b
5 changed files with 220 additions and 73 deletions
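In short: single-deployment model groups, which were previously exempt from cooldowns, are now cooled down only when 100% of a reasonably large volume of requests fails within the current minute. Below is a minimal sketch of that rule, assuming the default thresholds this commit adds to constants.py; it is illustrative only, and `should_cooldown_sketch` is not part of litellm's API.

# Illustrative sketch only -- approximates the new traffic-based cooldown decision
# for one deployment's last-minute stats; names mirror the diff below.
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # "reasonable traffic" floor
DEFAULT_FAILURE_THRESHOLD_PERCENT = 0.5


def should_cooldown_sketch(
    fails_this_minute: int,
    successes_this_minute: int,
    is_single_deployment_model_group: bool,
) -> bool:
    total = fails_this_minute + successes_this_minute
    percent_fails = fails_this_minute / total if total else 0.0
    # 100% failure under high traffic cools down even a single-deployment group.
    if percent_fails == 1.0 and total >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD:
        return True
    # Otherwise only multi-deployment groups cool down on a >50% failure rate.
    if percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
        return not is_single_deployment_model_group
    return False


# e.g. 1001 consecutive failures on a single deployment -> cooldown;
#      3 failures out of 3 requests -> no cooldown (below the traffic floor).
assert should_cooldown_sketch(1001, 0, is_single_deployment_model_group=True) is True
assert should_cooldown_sketch(3, 0, is_single_deployment_model_group=True) is False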
@@ -2,11 +2,16 @@ ROUTER_MAX_FALLBACKS = 5
 DEFAULT_BATCH_SIZE = 512
 DEFAULT_FLUSH_INTERVAL_SECONDS = 5
 DEFAULT_MAX_RETRIES = 2
+DEFAULT_FAILURE_THRESHOLD_PERCENT = (
+    0.5  # default cooldown a deployment if 50% of requests fail in a given minute
+)
+DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
 DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 LITELLM_CHAT_PROVIDERS = [
     "openai",
     "openai_like",
@@ -3617,66 +3617,6 @@ class Router:
         return request_count
 
-    def _is_cooldown_required(
-        self,
-        model_id: str,
-        exception_status: Union[str, int],
-        exception_str: Optional[str] = None,
-    ) -> bool:
-        """
-        A function to determine if a cooldown is required based on the exception status.
-
-        Parameters:
-            model_id (str) The id of the model in the model list
-            exception_status (Union[str, int]): The status of the exception.
-
-        Returns:
-            bool: True if a cooldown is required, False otherwise.
-        """
-        ## BASE CASE - single deployment
-        model_group = self.get_model_group(id=model_id)
-        if model_group is not None and len(model_group) == 1:
-            return False
-
-        try:
-            ignored_strings = ["APIConnectionError"]
-            if (
-                exception_str is not None
-            ):  # don't cooldown on litellm api connection errors errors
-                for ignored_string in ignored_strings:
-                    if ignored_string in exception_str:
-                        return False
-
-            if isinstance(exception_status, str):
-                exception_status = int(exception_status)
-
-            if exception_status >= 400 and exception_status < 500:
-                if exception_status == 429:
-                    # Cool down 429 Rate Limit Errors
-                    return True
-
-                elif exception_status == 401:
-                    # Cool down 401 Auth Errors
-                    return True
-
-                elif exception_status == 408:
-                    return True
-
-                elif exception_status == 404:
-                    return True
-
-                else:
-                    # Do NOT cool down all other 4XX Errors
-                    return False
-
-            else:
-                # should cool down for all other errors
-                return True
-
-        except Exception:
-            # Catch all - if any exceptions default to cooling down
-            return True
-
     def _has_default_fallbacks(self) -> bool:
         if self.fallbacks is None:
             return False
@@ -11,6 +11,11 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import litellm
 from litellm._logging import verbose_router_logger
+from litellm.constants import (
+    DEFAULT_COOLDOWN_TIME_SECONDS,
+    DEFAULT_FAILURE_THRESHOLD_PERCENT,
+    SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD,
+)
 from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
 
 from .router_callbacks.track_deployment_metrics import (
@@ -28,10 +33,62 @@ if TYPE_CHECKING:
 else:
     LitellmRouter = Any
     Span = Any
-DEFAULT_FAILURE_THRESHOLD_PERCENT = (
-    0.5  # default cooldown a deployment if 50% of requests fail in a given minute
-)
-DEFAULT_COOLDOWN_TIME_SECONDS = 5
+
+
+def _is_cooldown_required(
+    litellm_router_instance: LitellmRouter,
+    model_id: str,
+    exception_status: Union[str, int],
+    exception_str: Optional[str] = None,
+) -> bool:
+    """
+    A function to determine if a cooldown is required based on the exception status.
+
+    Parameters:
+        model_id (str) The id of the model in the model list
+        exception_status (Union[str, int]): The status of the exception.
+
+    Returns:
+        bool: True if a cooldown is required, False otherwise.
+    """
+    try:
+        ignored_strings = ["APIConnectionError"]
+        if (
+            exception_str is not None
+        ):  # don't cooldown on litellm api connection errors errors
+            for ignored_string in ignored_strings:
+                if ignored_string in exception_str:
+                    return False
+
+        if isinstance(exception_status, str):
+            exception_status = int(exception_status)
+
+        if exception_status >= 400 and exception_status < 500:
+            if exception_status == 429:
+                # Cool down 429 Rate Limit Errors
+                return True
+
+            elif exception_status == 401:
+                # Cool down 401 Auth Errors
+                return True
+
+            elif exception_status == 408:
+                return True
+
+            elif exception_status == 404:
+                return True
+
+            else:
+                # Do NOT cool down all other 4XX Errors
+                return False
+
+        else:
+            # should cool down for all other errors
+            return True
+
+    except Exception:
+        # Catch all - if any exceptions default to cooling down
+        return True
+
+
 def _should_run_cooldown_logic(
@@ -51,13 +108,20 @@ def _should_run_cooldown_logic(
     - deployment is in litellm_router_instance.provider_default_deployment_ids
     - exception_status is not one that should be immediately retried (e.g. 401)
     """
+    if (
+        deployment is None
+        or litellm_router_instance.get_model_group(id=deployment) is None
+    ):
+        return False
+
     if litellm_router_instance.disable_cooldowns:
         return False
 
     if deployment is None:
         return False
 
-    if not litellm_router_instance._is_cooldown_required(
+    if not _is_cooldown_required(
+        litellm_router_instance=litellm_router_instance,
         model_id=deployment,
         exception_status=exception_status,
         exception_str=str(original_exception),
@@ -94,6 +158,11 @@ def _should_cooldown_deployment(
 
     - v1 logic (Legacy): if allowed fails or allowed fail policy set, coolsdown if num fails in this minute > allowed fails
     """
+    ## BASE CASE - single deployment
+    model_group = litellm_router_instance.get_model_group(id=deployment)
+    is_single_deployment_model_group = False
+    if model_group is not None and len(model_group) == 1:
+        is_single_deployment_model_group = True
     if (
         litellm_router_instance.allowed_fails_policy is None
         and _is_allowed_fails_set_on_router(
@@ -121,14 +190,21 @@ def _should_cooldown_deployment(
             num_successes_this_minute,
             num_fails_this_minute,
         )
 
         exception_status_int = cast_exception_status_to_int(exception_status)
-        if exception_status_int == 429:
+        if exception_status_int == 429 and not is_single_deployment_model_group:
             return True
         elif (
-            total_requests_this_minute == 1
-        ):  # if the 1st request fails it's not guaranteed that the deployment should be cooled down
-            return False
-        elif percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
+            percent_fails == 1.0
+            and total_requests_this_minute
+            >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+        ):
+            # Cooldown if all requests failed and we have reasonable traffic
+            return True
+        elif (
+            percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT
+            and not is_single_deployment_model_group  # by default we should avoid cooldowns on single deployment model groups
+        ):
             return True
 
         elif (
@@ -140,7 +216,7 @@ def _should_cooldown_deployment(
             return True
 
         return False
-    else:
+    elif not is_single_deployment_model_group:
         return should_cooldown_based_on_allowed_fails_policy(
             litellm_router_instance=litellm_router_instance,
             deployment=deployment,
@@ -2190,6 +2190,8 @@ def test_router_context_window_pre_call_check(model, base_model, llm_provider):
 
 
 def test_router_cooldown_api_connection_error():
+    from litellm.router_utils.cooldown_handlers import _is_cooldown_required
+
     try:
         _ = litellm.completion(
             model="vertex_ai/gemini-1.5-pro",
@@ -2197,8 +2199,11 @@ def test_router_cooldown_api_connection_error():
         )
     except litellm.APIConnectionError as e:
         assert (
-            Router()._is_cooldown_required(
-                model_id="", exception_status=e.code, exception_str=str(e)
+            _is_cooldown_required(
+                litellm_router_instance=Router(),
+                model_id="",
+                exception_status=e.code,
+                exception_str=str(e),
             )
             is False
         )
@@ -25,6 +25,11 @@ from litellm.router_utils.router_callbacks.track_deployment_metrics import (
     increment_deployment_successes_for_current_minute,
 )
 
+import pytest
+from unittest.mock import patch
+from litellm import Router
+from litellm.router_utils.cooldown_handlers import _should_cooldown_deployment
+
 load_dotenv()
 
 
@@ -183,6 +188,11 @@ def testing_litellm_router():
             "litellm_params": {"model": "openai/test_deployment"},
             "model_id": "test_deployment_2",
         },
+        {
+            "model_name": "test_deployment",
+            "litellm_params": {"model": "openai/test_deployment-2"},
+            "model_id": "test_deployment_3",
+        },
     ]
 )
 
@@ -395,3 +405,114 @@ def test_cast_exception_status_to_int():
     assert cast_exception_status_to_int(200) == 200
     assert cast_exception_status_to_int("404") == 404
     assert cast_exception_status_to_int("invalid") == 500
+
+
+@pytest.fixture
+def router():
+    return Router(
+        model_list=[
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {"model": "gpt-4"},
+                "model_info": {
+                    "id": "gpt-4--0",
+                },
+            }
+        ]
+    )
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_should_cooldown_high_traffic_all_fails(mock_failures, mock_successes, router):
+    # Simulate 10 failures, 0 successes
+    from litellm.constants import SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+
+    mock_failures.return_value = SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD + 1
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is True
+    ), "Should cooldown when all requests fail with sufficient traffic"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_no_cooldown_low_traffic(mock_failures, mock_successes, router):
+    # Simulate 3 failures (below MIN_TRAFFIC_THRESHOLD)
+    mock_failures.return_value = 3
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown when traffic is below threshold"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_cooldown_rate_limit(mock_failures, mock_successes, router):
+    """
+    Don't cooldown single deployment models, for anything besides traffic
+    """
+    mock_failures.return_value = 1
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=429,  # Rate limit error
+        original_exception=Exception("Rate limit exceeded"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown on rate limit error for single deployment models"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_mixed_success_failure(mock_failures, mock_successes, router):
+    # Simulate 3 failures, 7 successes
+    mock_failures.return_value = 3
+    mock_successes.return_value = 7
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown when failure rate is below threshold"