Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)
Improve Proxy Resiliency: Cooldown single-deployment model groups if 100% calls failed in high traffic (#7823)
* refactor(_is_cooldown_required): move '_is_cooldown_required' into cooldown_handlers.py
* refactor(cooldown_handlers.py): move cooldown constants into `constants.py`
* fix(cooldown_handlers.py): remove the "never cooldown a single deployment" logic and replace it with traffic-based cooldown logic. Addresses https://github.com/BerriAI/litellm/issues/7822
* fix: add unit tests for '_should_cooldown_deployment'
* test: ensure all tests pass
* test: update test
* fix(cooldown_handlers.py): don't cooldown single-deployment models for anything besides traffic-related errors
* fix(cooldown_handlers.py): fix cooldown handler logic
* fix(cooldown_handlers.py): fix check
Parent: d00febcdaa
Commit: 80f7af510b
5 changed files with 220 additions and 73 deletions
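In short: single-deployment model groups, which were previously exempt from cooldowns, are now cooled down only when 100% of a reasonably large volume of requests fails within the current minute. Below is a minimal sketch of that rule, assuming the default thresholds this commit adds to constants.py; it is illustrative only, and `should_cooldown_sketch` is not part of litellm's API.

# Illustrative sketch only -- approximates the new traffic-based cooldown decision
# for one deployment's last-minute stats; names mirror the diff below.
SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # "reasonable traffic" floor
DEFAULT_FAILURE_THRESHOLD_PERCENT = 0.5


def should_cooldown_sketch(
    fails_this_minute: int,
    successes_this_minute: int,
    is_single_deployment_model_group: bool,
) -> bool:
    total = fails_this_minute + successes_this_minute
    percent_fails = fails_this_minute / total if total else 0.0
    # 100% failure under high traffic cools down even a single-deployment group.
    if percent_fails == 1.0 and total >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD:
        return True
    # Otherwise only multi-deployment groups cool down on a >50% failure rate.
    if percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
        return not is_single_deployment_model_group
    return False


# e.g. 1001 consecutive failures on a single deployment -> cooldown;
#      3 failures out of 3 requests -> no cooldown (below the traffic floor).
assert should_cooldown_sketch(1001, 0, is_single_deployment_model_group=True) is True
assert should_cooldown_sketch(3, 0, is_single_deployment_model_group=True) is False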
@@ -2,11 +2,16 @@ ROUTER_MAX_FALLBACKS = 5
 DEFAULT_BATCH_SIZE = 512
 DEFAULT_FLUSH_INTERVAL_SECONDS = 5
 DEFAULT_MAX_RETRIES = 2
+DEFAULT_FAILURE_THRESHOLD_PERCENT = (
+    0.5  # default cooldown a deployment if 50% of requests fail in a given minute
+)
+DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
 DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 LITELLM_CHAT_PROVIDERS = [
     "openai",
     "openai_like",
@@ -3617,66 +3617,6 @@ class Router:
         return request_count
 
-    def _is_cooldown_required(
-        self,
-        model_id: str,
-        exception_status: Union[str, int],
-        exception_str: Optional[str] = None,
-    ) -> bool:
-        """
-        A function to determine if a cooldown is required based on the exception status.
-
-        Parameters:
-            model_id (str) The id of the model in the model list
-            exception_status (Union[str, int]): The status of the exception.
-
-        Returns:
-            bool: True if a cooldown is required, False otherwise.
-        """
-        ## BASE CASE - single deployment
-        model_group = self.get_model_group(id=model_id)
-        if model_group is not None and len(model_group) == 1:
-            return False
-
-        try:
-            ignored_strings = ["APIConnectionError"]
-            if (
-                exception_str is not None
-            ):  # don't cooldown on litellm api connection errors errors
-                for ignored_string in ignored_strings:
-                    if ignored_string in exception_str:
-                        return False
-
-            if isinstance(exception_status, str):
-                exception_status = int(exception_status)
-
-            if exception_status >= 400 and exception_status < 500:
-                if exception_status == 429:
-                    # Cool down 429 Rate Limit Errors
-                    return True
-
-                elif exception_status == 401:
-                    # Cool down 401 Auth Errors
-                    return True
-
-                elif exception_status == 408:
-                    return True
-
-                elif exception_status == 404:
-                    return True
-
-                else:
-                    # Do NOT cool down all other 4XX Errors
-                    return False
-
-            else:
-                # should cool down for all other errors
-                return True
-
-        except Exception:
-            # Catch all - if any exceptions default to cooling down
-            return True
-
     def _has_default_fallbacks(self) -> bool:
         if self.fallbacks is None:
             return False
@@ -11,6 +11,11 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import litellm
 from litellm._logging import verbose_router_logger
+from litellm.constants import (
+    DEFAULT_COOLDOWN_TIME_SECONDS,
+    DEFAULT_FAILURE_THRESHOLD_PERCENT,
+    SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD,
+)
 from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
 
 from .router_callbacks.track_deployment_metrics import (
@@ -28,10 +33,62 @@ if TYPE_CHECKING:
 else:
     LitellmRouter = Any
     Span = Any
-DEFAULT_FAILURE_THRESHOLD_PERCENT = (
-    0.5  # default cooldown a deployment if 50% of requests fail in a given minute
-)
-DEFAULT_COOLDOWN_TIME_SECONDS = 5
+
+
+def _is_cooldown_required(
+    litellm_router_instance: LitellmRouter,
+    model_id: str,
+    exception_status: Union[str, int],
+    exception_str: Optional[str] = None,
+) -> bool:
+    """
+    A function to determine if a cooldown is required based on the exception status.
+
+    Parameters:
+        model_id (str) The id of the model in the model list
+        exception_status (Union[str, int]): The status of the exception.
+
+    Returns:
+        bool: True if a cooldown is required, False otherwise.
+    """
+    try:
+        ignored_strings = ["APIConnectionError"]
+        if (
+            exception_str is not None
+        ):  # don't cooldown on litellm api connection errors errors
+            for ignored_string in ignored_strings:
+                if ignored_string in exception_str:
+                    return False
+
+        if isinstance(exception_status, str):
+            exception_status = int(exception_status)
+
+        if exception_status >= 400 and exception_status < 500:
+            if exception_status == 429:
+                # Cool down 429 Rate Limit Errors
+                return True
+
+            elif exception_status == 401:
+                # Cool down 401 Auth Errors
+                return True
+
+            elif exception_status == 408:
+                return True
+
+            elif exception_status == 404:
+                return True
+
+            else:
+                # Do NOT cool down all other 4XX Errors
+                return False
+
+        else:
+            # should cool down for all other errors
+            return True
+
+    except Exception:
+        # Catch all - if any exceptions default to cooling down
+        return True
+
+
 def _should_run_cooldown_logic(
@@ -51,13 +108,20 @@ def _should_run_cooldown_logic(
     - deployment is in litellm_router_instance.provider_default_deployment_ids
     - exception_status is not one that should be immediately retried (e.g. 401)
     """
+    if (
+        deployment is None
+        or litellm_router_instance.get_model_group(id=deployment) is None
+    ):
+        return False
+
     if litellm_router_instance.disable_cooldowns:
         return False
 
     if deployment is None:
         return False
 
-    if not litellm_router_instance._is_cooldown_required(
+    if not _is_cooldown_required(
+        litellm_router_instance=litellm_router_instance,
         model_id=deployment,
         exception_status=exception_status,
         exception_str=str(original_exception),
@@ -94,6 +158,11 @@ def _should_cooldown_deployment(
 
     - v1 logic (Legacy): if allowed fails or allowed fail policy set, coolsdown if num fails in this minute > allowed fails
     """
+    ## BASE CASE - single deployment
+    model_group = litellm_router_instance.get_model_group(id=deployment)
+    is_single_deployment_model_group = False
+    if model_group is not None and len(model_group) == 1:
+        is_single_deployment_model_group = True
     if (
         litellm_router_instance.allowed_fails_policy is None
         and _is_allowed_fails_set_on_router(
@@ -121,14 +190,21 @@ def _should_cooldown_deployment(
             num_successes_this_minute,
             num_fails_this_minute,
         )
 
         exception_status_int = cast_exception_status_to_int(exception_status)
-        if exception_status_int == 429:
+        if exception_status_int == 429 and not is_single_deployment_model_group:
             return True
         elif (
-            total_requests_this_minute == 1
-        ):  # if the 1st request fails it's not guaranteed that the deployment should be cooled down
-            return False
-        elif percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
+            percent_fails == 1.0
+            and total_requests_this_minute
+            >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+        ):
+            # Cooldown if all requests failed and we have reasonable traffic
+            return True
+        elif (
+            percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT
+            and not is_single_deployment_model_group  # by default we should avoid cooldowns on single deployment model groups
+        ):
             return True
 
         elif (
@@ -140,7 +216,7 @@ def _should_cooldown_deployment(
             return True
 
         return False
-    else:
+    elif not is_single_deployment_model_group:
         return should_cooldown_based_on_allowed_fails_policy(
             litellm_router_instance=litellm_router_instance,
             deployment=deployment,
@@ -2190,6 +2190,8 @@ def test_router_context_window_pre_call_check(model, base_model, llm_provider):
 
 
 def test_router_cooldown_api_connection_error():
+    from litellm.router_utils.cooldown_handlers import _is_cooldown_required
+
     try:
         _ = litellm.completion(
             model="vertex_ai/gemini-1.5-pro",
@@ -2197,8 +2199,11 @@ def test_router_cooldown_api_connection_error():
         )
     except litellm.APIConnectionError as e:
         assert (
-            Router()._is_cooldown_required(
-                model_id="", exception_status=e.code, exception_str=str(e)
+            _is_cooldown_required(
+                litellm_router_instance=Router(),
+                model_id="",
+                exception_status=e.code,
+                exception_str=str(e),
             )
             is False
         )
@@ -25,6 +25,11 @@ from litellm.router_utils.router_callbacks.track_deployment_metrics import (
     increment_deployment_successes_for_current_minute,
 )
 
+import pytest
+from unittest.mock import patch
+from litellm import Router
+from litellm.router_utils.cooldown_handlers import _should_cooldown_deployment
+
 load_dotenv()
 
 
@@ -183,6 +188,11 @@ def testing_litellm_router():
             "litellm_params": {"model": "openai/test_deployment"},
             "model_id": "test_deployment_2",
         },
+        {
+            "model_name": "test_deployment",
+            "litellm_params": {"model": "openai/test_deployment-2"},
+            "model_id": "test_deployment_3",
+        },
     ]
 )
 
@@ -395,3 +405,114 @@ def test_cast_exception_status_to_int():
     assert cast_exception_status_to_int(200) == 200
     assert cast_exception_status_to_int("404") == 404
     assert cast_exception_status_to_int("invalid") == 500
+
+
+@pytest.fixture
+def router():
+    return Router(
+        model_list=[
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {"model": "gpt-4"},
+                "model_info": {
+                    "id": "gpt-4--0",
+                },
+            }
+        ]
+    )
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_should_cooldown_high_traffic_all_fails(mock_failures, mock_successes, router):
+    # Simulate 10 failures, 0 successes
+    from litellm.constants import SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+
+    mock_failures.return_value = SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD + 1
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is True
+    ), "Should cooldown when all requests fail with sufficient traffic"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_no_cooldown_low_traffic(mock_failures, mock_successes, router):
+    # Simulate 3 failures (below MIN_TRAFFIC_THRESHOLD)
+    mock_failures.return_value = 3
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown when traffic is below threshold"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_cooldown_rate_limit(mock_failures, mock_successes, router):
+    """
+    Don't cooldown single deployment models, for anything besides traffic
+    """
+    mock_failures.return_value = 1
+    mock_successes.return_value = 0
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=429,  # Rate limit error
+        original_exception=Exception("Rate limit exceeded"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown on rate limit error for single deployment models"
+
+
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_successes_for_current_minute"
+)
+@patch(
+    "litellm.router_utils.cooldown_handlers.get_deployment_failures_for_current_minute"
+)
+def test_mixed_success_failure(mock_failures, mock_successes, router):
+    # Simulate 3 failures, 7 successes
+    mock_failures.return_value = 3
+    mock_successes.return_value = 7
+
+    should_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=router,
+        deployment="gpt-4--0",
+        exception_status=500,
+        original_exception=Exception("Test error"),
+    )
+
+    assert (
+        should_cooldown is False
+    ), "Should not cooldown when failure rate is below threshold"