Improve Proxy Resiliency: Cooldown single-deployment model groups if 100% calls failed in high traffic (#7823)

* refactor(_is_cooldown_required): move '_is_cooldown_required' into cooldown_handlers.py

* refactor(cooldown_handlers.py): move cooldown constants into `.constants.py`

* fix(cooldown_handlers.py): remove the "don't cooldown if single deployment" logic

move to traffic based cooldown logic

Addresses https://github.com/BerriAI/litellm/issues/7822

* fix: add unit tests for '_should_cooldown_deployment'

* test: ensure all tests pass

* test: update test

* fix(cooldown_handlers.py): don't cooldown single deployment models for anything besides traffic related errors

* fix(cooldown_handlers.py): fix cooldown handler logic

* fix(cooldown_handlers.py): fix check
This commit is contained in:
Krish Dholakia 2025-01-17 20:17:02 -08:00 committed by GitHub
parent d00febcdaa
commit 80f7af510b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 220 additions and 73 deletions

View file

@ -11,6 +11,11 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
import litellm
from litellm._logging import verbose_router_logger
from litellm.constants import (
DEFAULT_COOLDOWN_TIME_SECONDS,
DEFAULT_FAILURE_THRESHOLD_PERCENT,
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD,
)
from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
from .router_callbacks.track_deployment_metrics import (
@ -28,10 +33,62 @@ if TYPE_CHECKING:
else:
LitellmRouter = Any
Span = Any
# Failure-rate threshold used when deciding on cooldowns. Despite "PERCENT" in
# the name, the value is a fraction (0.5 == 50%).
DEFAULT_FAILURE_THRESHOLD_PERCENT = (
0.5 # by default, cooldown a deployment if more than 50% of requests fail in a given minute
)
# NOTE(review): presumably the default duration, in seconds, that a deployment
# stays cooled down - confirm against the code that consumes this constant.
DEFAULT_COOLDOWN_TIME_SECONDS = 5
def _is_cooldown_required(
litellm_router_instance: LitellmRouter,
model_id: str,
exception_status: Union[str, int],
exception_str: Optional[str] = None,
) -> bool:
"""
A function to determine if a cooldown is required based on the exception status.
Parameters:
model_id (str) The id of the model in the model list
exception_status (Union[str, int]): The status of the exception.
Returns:
bool: True if a cooldown is required, False otherwise.
"""
try:
ignored_strings = ["APIConnectionError"]
if (
exception_str is not None
): # don't cooldown on litellm api connection errors errors
for ignored_string in ignored_strings:
if ignored_string in exception_str:
return False
if isinstance(exception_status, str):
exception_status = int(exception_status)
if exception_status >= 400 and exception_status < 500:
if exception_status == 429:
# Cool down 429 Rate Limit Errors
return True
elif exception_status == 401:
# Cool down 401 Auth Errors
return True
elif exception_status == 408:
return True
elif exception_status == 404:
return True
else:
# Do NOT cool down all other 4XX Errors
return False
else:
# should cool down for all other errors
return True
except Exception:
# Catch all - if any exceptions default to cooling down
return True
def _should_run_cooldown_logic(
@ -51,13 +108,20 @@ def _should_run_cooldown_logic(
- deployment is in litellm_router_instance.provider_default_deployment_ids
- exception_status is not one that should be immediately retried (e.g. 401)
"""
if (
deployment is None
or litellm_router_instance.get_model_group(id=deployment) is None
):
return False
if litellm_router_instance.disable_cooldowns:
return False
if deployment is None:
return False
if not litellm_router_instance._is_cooldown_required(
if not _is_cooldown_required(
litellm_router_instance=litellm_router_instance,
model_id=deployment,
exception_status=exception_status,
exception_str=str(original_exception),
@ -94,6 +158,11 @@ def _should_cooldown_deployment(
- v1 logic (Legacy): if allowed fails or allowed fail policy set, coolsdown if num fails in this minute > allowed fails
"""
## BASE CASE - single deployment
model_group = litellm_router_instance.get_model_group(id=deployment)
is_single_deployment_model_group = False
if model_group is not None and len(model_group) == 1:
is_single_deployment_model_group = True
if (
litellm_router_instance.allowed_fails_policy is None
and _is_allowed_fails_set_on_router(
@ -121,14 +190,21 @@ def _should_cooldown_deployment(
num_successes_this_minute,
num_fails_this_minute,
)
exception_status_int = cast_exception_status_to_int(exception_status)
if exception_status_int == 429:
if exception_status_int == 429 and not is_single_deployment_model_group:
return True
elif (
total_requests_this_minute == 1
): # if the 1st request fails it's not guaranteed that the deployment should be cooled down
return False
elif percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
percent_fails == 1.0
and total_requests_this_minute
>= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
):
# Cooldown if all requests failed and we have reasonable traffic
return True
elif (
percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT
and not is_single_deployment_model_group # by default we should avoid cooldowns on single deployment model groups
):
return True
elif (
@ -140,7 +216,7 @@ def _should_cooldown_deployment(
return True
return False
else:
elif not is_single_deployment_model_group:
return should_cooldown_based_on_allowed_fails_policy(
litellm_router_instance=litellm_router_instance,
deployment=deployment,