Improve Proxy Resiliency: Cooldown single-deployment model groups if 100% calls failed in high traffic (#7823)

* refactor(_is_cooldown_required): move '_is_cooldown_required' into cooldown_handlers.py
* refactor(cooldown_handlers.py): move cooldown constants into `constants.py`
* fix(cooldown_handlers.py): remove the "if single deployment, don't cooldown" logic and move to traffic-based cooldown logic. Addresses https://github.com/BerriAI/litellm/issues/7822
* fix: add unit tests for '_should_cooldown_deployment'
* test: ensure all tests pass
* test: update test
* fix(cooldown_handlers.py): don't cooldown single-deployment models for anything besides traffic-related errors
* fix(cooldown_handlers.py): fix cooldown handler logic
* fix(cooldown_handlers.py): fix check
2025-04-26 03:04:13 +00:00 · 2025-01-17 20:17:02 -08:00 · 2025-01-17 20:17:02 -08:00 · 80f7af510b
commit 80f7af510b
parent d00febcdaa
5 changed files with 220 additions and 73 deletions
--- a/litellm/router_utils/cooldown_handlers.py
+++ b/litellm/router_utils/cooldown_handlers.py
@ -11,6 +11,11 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union

 import litellm
 from litellm._logging import verbose_router_logger
+from litellm.constants import (
+    DEFAULT_COOLDOWN_TIME_SECONDS,
+    DEFAULT_FAILURE_THRESHOLD_PERCENT,
+    SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD,
+)
 from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback

 from .router_callbacks.track_deployment_metrics import (
@ -28,10 +33,62 @@ if TYPE_CHECKING:
 else:
    LitellmRouter = Any
    Span = Any
-DEFAULT_FAILURE_THRESHOLD_PERCENT = (
-    0.5  # default cooldown a deployment if 50% of requests fail in a given minute
-)
-DEFAULT_COOLDOWN_TIME_SECONDS = 5
+
+
+def _is_cooldown_required(
+    litellm_router_instance: LitellmRouter,
+    model_id: str,
+    exception_status: Union[str, int],
+    exception_str: Optional[str] = None,
+) -> bool:
+    """
+    A function to determine if a cooldown is required based on the exception status.
+
+    Parameters:
+        model_id (str): The id of the model in the model list
+        exception_status (Union[str, int]): The status of the exception.
+
+    Returns:
+        bool: True if a cooldown is required, False otherwise.
+    """
+    try:
+        ignored_strings = ["APIConnectionError"]
+        if (
+            exception_str is not None
+        ):  # don't cooldown on litellm api connection errors
+            for ignored_string in ignored_strings:
+                if ignored_string in exception_str:
+                    return False
+
+        if isinstance(exception_status, str):
+            exception_status = int(exception_status)
+
+        if exception_status >= 400 and exception_status < 500:
+            if exception_status == 429:
+                # Cool down 429 Rate Limit Errors
+                return True
+
+            elif exception_status == 401:
+                # Cool down 401 Auth Errors
+                return True
+
+            elif exception_status == 408:
+                return True
+
+            elif exception_status == 404:
+                return True
+
+            else:
+                # Do NOT cool down all other 4XX Errors
+                return False
+
+        else:
+            # should cool down for all other errors
+            return True
+
+    except Exception:
+        # Catch all - if any exceptions default to cooling down
+        return True


 def _should_run_cooldown_logic(
@ -51,13 +108,20 @@ def _should_run_cooldown_logic(
    - deployment is in litellm_router_instance.provider_default_deployment_ids
    - exception_status is not one that should be immediately retried (e.g. 401)
    """
+    if (
+        deployment is None
+        or litellm_router_instance.get_model_group(id=deployment) is None
+    ):
+        return False
+
    if litellm_router_instance.disable_cooldowns:
        return False

    if deployment is None:
        return False

-    if not litellm_router_instance._is_cooldown_required(
+    if not _is_cooldown_required(
+        litellm_router_instance=litellm_router_instance,
        model_id=deployment,
        exception_status=exception_status,
        exception_str=str(original_exception),
@ -94,6 +158,11 @@ def _should_cooldown_deployment(

    - v1 logic (Legacy): if allowed fails or allowed fail policy set, coolsdown if num fails in this minute > allowed fails
    """
+    ## BASE CASE - single deployment
+    model_group = litellm_router_instance.get_model_group(id=deployment)
+    is_single_deployment_model_group = False
+    if model_group is not None and len(model_group) == 1:
+        is_single_deployment_model_group = True
    if (
        litellm_router_instance.allowed_fails_policy is None
        and _is_allowed_fails_set_on_router(
@ -121,14 +190,21 @@ def _should_cooldown_deployment(
            num_successes_this_minute,
            num_fails_this_minute,
        )
+
        exception_status_int = cast_exception_status_to_int(exception_status)
-        if exception_status_int == 429:
+        if exception_status_int == 429 and not is_single_deployment_model_group:
            return True
        elif (
-            total_requests_this_minute == 1
-        ):  # if the 1st request fails it's not guaranteed that the deployment should be cooled down
-            return False
-        elif percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
+            percent_fails == 1.0
+            and total_requests_this_minute
+            >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+        ):
+            # Cooldown if all requests failed and we have reasonable traffic
+            return True
+        elif (
+            percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT
+            and not is_single_deployment_model_group  # by default we should avoid cooldowns on single deployment model groups
+        ):
            return True

        elif (
@ -140,7 +216,7 @@ def _should_cooldown_deployment(
            return True

        return False
-    else:
+    elif not is_single_deployment_model_group:
        return should_cooldown_based_on_allowed_fails_policy(
            litellm_router_instance=litellm_router_instance,
            deployment=deployment,