forked from phoenix/litellm-mirror
[Fix] Router cooldown logic - use % thresholds instead of allowed fails to cooldown deployments (#5698)
* move cooldown logic to it's own helper * add new track deployment metrics folder * increment success, fails for deployment in current minute * fix cooldown logic * fix test_aaarouter_dynamic_cooldown_message_retry_time * fix test_single_deployment_no_cooldowns_test_prod_mock_completion_calls * clean up get from deployment test * fix _async_get_healthy_deployments * add mock InternalServerError * test deployment failing 25% requests * add test_high_traffic_cooldowns_one_bad_deployment * fix vertex load test * add test for rate limit error models in cool down * change default cooldown time * fix cooldown message time * fix cooldown on 429 error * fix doc string for _should_cooldown_deployment * fix sync cooldown logic router
This commit is contained in:
parent
7c2ddba6c6
commit
c8d15544c8
11 changed files with 836 additions and 175 deletions
|
@ -528,6 +528,15 @@ def mock_completion(
|
||||||
llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"), # type: ignore
|
llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"), # type: ignore
|
||||||
model=model,
|
model=model,
|
||||||
)
|
)
|
||||||
|
elif (
|
||||||
|
isinstance(mock_response, str)
|
||||||
|
and mock_response == "litellm.InternalServerError"
|
||||||
|
):
|
||||||
|
raise litellm.InternalServerError(
|
||||||
|
message="this is a mock internal server error",
|
||||||
|
llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"), # type: ignore
|
||||||
|
model=model,
|
||||||
|
)
|
||||||
elif isinstance(mock_response, str) and mock_response.startswith(
|
elif isinstance(mock_response, str) and mock_response.startswith(
|
||||||
"Exception: content_filter_policy"
|
"Exception: content_filter_policy"
|
||||||
):
|
):
|
||||||
|
|
|
@ -6,6 +6,14 @@ model_list:
|
||||||
vertex_project: "adroit-crow-413218"
|
vertex_project: "adroit-crow-413218"
|
||||||
vertex_location: "us-central1"
|
vertex_location: "us-central1"
|
||||||
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
|
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
|
||||||
|
- model_name: gemini-vision
|
||||||
|
litellm_params:
|
||||||
|
model: vertex_ai/gemini-1.0-pro-vision-001
|
||||||
|
api_base: https://exampleopenaiendpoint-production-c715.up.railway.app/v1/projects/adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001
|
||||||
|
vertex_project: "adroit-crow-413218"
|
||||||
|
vertex_location: "us-central1"
|
||||||
|
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
|
||||||
|
|
||||||
- model_name: fake-azure-endpoint
|
- model_name: fake-azure-endpoint
|
||||||
litellm_params:
|
litellm_params:
|
||||||
model: openai/429
|
model: openai/429
|
||||||
|
|
|
@ -54,6 +54,13 @@ from litellm.router_utils.client_initalization_utils import (
|
||||||
)
|
)
|
||||||
from litellm.router_utils.cooldown_cache import CooldownCache
|
from litellm.router_utils.cooldown_cache import CooldownCache
|
||||||
from litellm.router_utils.cooldown_callbacks import router_cooldown_handler
|
from litellm.router_utils.cooldown_callbacks import router_cooldown_handler
|
||||||
|
from litellm.router_utils.cooldown_handlers import (
|
||||||
|
DEFAULT_COOLDOWN_TIME_SECONDS,
|
||||||
|
_async_get_cooldown_deployments,
|
||||||
|
_async_get_cooldown_deployments_with_debug_info,
|
||||||
|
_get_cooldown_deployments,
|
||||||
|
_set_cooldown_deployments,
|
||||||
|
)
|
||||||
from litellm.router_utils.fallback_event_handlers import (
|
from litellm.router_utils.fallback_event_handlers import (
|
||||||
log_failure_fallback_event,
|
log_failure_fallback_event,
|
||||||
log_success_fallback_event,
|
log_success_fallback_event,
|
||||||
|
@ -61,6 +68,10 @@ from litellm.router_utils.fallback_event_handlers import (
|
||||||
run_sync_fallback,
|
run_sync_fallback,
|
||||||
)
|
)
|
||||||
from litellm.router_utils.handle_error import send_llm_exception_alert
|
from litellm.router_utils.handle_error import send_llm_exception_alert
|
||||||
|
from litellm.router_utils.router_callbacks.track_deployment_metrics import (
|
||||||
|
increment_deployment_failures_for_current_minute,
|
||||||
|
increment_deployment_successes_for_current_minute,
|
||||||
|
)
|
||||||
from litellm.scheduler import FlowItem, Scheduler
|
from litellm.scheduler import FlowItem, Scheduler
|
||||||
from litellm.types.llms.openai import (
|
from litellm.types.llms.openai import (
|
||||||
Assistant,
|
Assistant,
|
||||||
|
@ -346,7 +357,7 @@ class Router:
|
||||||
self.allowed_fails = allowed_fails
|
self.allowed_fails = allowed_fails
|
||||||
else:
|
else:
|
||||||
self.allowed_fails = litellm.allowed_fails
|
self.allowed_fails = litellm.allowed_fails
|
||||||
self.cooldown_time = cooldown_time or 60
|
self.cooldown_time = cooldown_time or DEFAULT_COOLDOWN_TIME_SECONDS
|
||||||
self.cooldown_cache = CooldownCache(
|
self.cooldown_cache = CooldownCache(
|
||||||
cache=self.cache, default_cooldown_time=self.cooldown_time
|
cache=self.cache, default_cooldown_time=self.cooldown_time
|
||||||
)
|
)
|
||||||
|
@ -444,6 +455,10 @@ class Router:
|
||||||
litellm._async_success_callback.append(self.deployment_callback_on_success)
|
litellm._async_success_callback.append(self.deployment_callback_on_success)
|
||||||
else:
|
else:
|
||||||
litellm._async_success_callback.append(self.deployment_callback_on_success)
|
litellm._async_success_callback.append(self.deployment_callback_on_success)
|
||||||
|
if isinstance(litellm.success_callback, list):
|
||||||
|
litellm.success_callback.append(self.sync_deployment_callback_on_success)
|
||||||
|
else:
|
||||||
|
litellm.success_callback = [self.sync_deployment_callback_on_success]
|
||||||
## COOLDOWNS ##
|
## COOLDOWNS ##
|
||||||
if isinstance(litellm.failure_callback, list):
|
if isinstance(litellm.failure_callback, list):
|
||||||
litellm.failure_callback.append(self.deployment_callback_on_failure)
|
litellm.failure_callback.append(self.deployment_callback_on_failure)
|
||||||
|
@ -3001,7 +3016,9 @@ class Router:
|
||||||
"litellm.router.py::async_function_with_fallbacks() - Error occurred while trying to do fallbacks - {}\n{}\n\nDebug Information:\nCooldown Deployments={}".format(
|
"litellm.router.py::async_function_with_fallbacks() - Error occurred while trying to do fallbacks - {}\n{}\n\nDebug Information:\nCooldown Deployments={}".format(
|
||||||
str(new_exception),
|
str(new_exception),
|
||||||
traceback.format_exc(),
|
traceback.format_exc(),
|
||||||
await self._async_get_cooldown_deployments_with_debug_info(),
|
await _async_get_cooldown_deployments_with_debug_info(
|
||||||
|
litellm_router_instance=self
|
||||||
|
),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
fallback_failure_exception_str = str(new_exception)
|
fallback_failure_exception_str = str(new_exception)
|
||||||
|
@ -3536,6 +3553,11 @@ class Router:
|
||||||
key=tpm_key, value=total_tokens, ttl=RoutingArgs.ttl.value
|
key=tpm_key, value=total_tokens, ttl=RoutingArgs.ttl.value
|
||||||
)
|
)
|
||||||
|
|
||||||
|
increment_deployment_successes_for_current_minute(
|
||||||
|
litellm_router_instance=self,
|
||||||
|
deployment_id=id,
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
verbose_router_logger.exception(
|
verbose_router_logger.exception(
|
||||||
"litellm.proxy.hooks.prompt_injection_detection.py::async_pre_call_hook(): Exception occured - {}".format(
|
"litellm.proxy.hooks.prompt_injection_detection.py::async_pre_call_hook(): Exception occured - {}".format(
|
||||||
|
@ -3544,6 +3566,31 @@ class Router:
|
||||||
)
|
)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def sync_deployment_callback_on_success(
|
||||||
|
self,
|
||||||
|
kwargs, # kwargs to completion
|
||||||
|
completion_response, # response from completion
|
||||||
|
start_time,
|
||||||
|
end_time, # start/end time
|
||||||
|
):
|
||||||
|
id = None
|
||||||
|
if kwargs["litellm_params"].get("metadata") is None:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
model_group = kwargs["litellm_params"]["metadata"].get("model_group", None)
|
||||||
|
model_info = kwargs["litellm_params"].get("model_info", {}) or {}
|
||||||
|
id = model_info.get("id", None)
|
||||||
|
if model_group is None or id is None:
|
||||||
|
return
|
||||||
|
elif isinstance(id, int):
|
||||||
|
id = str(id)
|
||||||
|
|
||||||
|
if id is not None:
|
||||||
|
increment_deployment_successes_for_current_minute(
|
||||||
|
litellm_router_instance=self,
|
||||||
|
deployment_id=id,
|
||||||
|
)
|
||||||
|
|
||||||
def deployment_callback_on_failure(
|
def deployment_callback_on_failure(
|
||||||
self,
|
self,
|
||||||
kwargs, # kwargs to completion
|
kwargs, # kwargs to completion
|
||||||
|
@ -3595,7 +3642,12 @@ class Router:
|
||||||
|
|
||||||
if isinstance(_model_info, dict):
|
if isinstance(_model_info, dict):
|
||||||
deployment_id = _model_info.get("id", None)
|
deployment_id = _model_info.get("id", None)
|
||||||
self._set_cooldown_deployments(
|
increment_deployment_failures_for_current_minute(
|
||||||
|
litellm_router_instance=self,
|
||||||
|
deployment_id=deployment_id,
|
||||||
|
)
|
||||||
|
_set_cooldown_deployments(
|
||||||
|
litellm_router_instance=self,
|
||||||
exception_status=exception_status,
|
exception_status=exception_status,
|
||||||
original_exception=exception,
|
original_exception=exception,
|
||||||
deployment=deployment_id,
|
deployment=deployment_id,
|
||||||
|
@ -3753,155 +3805,6 @@ class Router:
|
||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _set_cooldown_deployments(
|
|
||||||
self,
|
|
||||||
original_exception: Any,
|
|
||||||
exception_status: Union[str, int],
|
|
||||||
deployment: Optional[str] = None,
|
|
||||||
time_to_cooldown: Optional[float] = None,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
|
|
||||||
|
|
||||||
or
|
|
||||||
|
|
||||||
the exception is not one that should be immediately retried (e.g. 401)
|
|
||||||
"""
|
|
||||||
if self.disable_cooldowns is True:
|
|
||||||
return
|
|
||||||
|
|
||||||
if deployment is None:
|
|
||||||
return
|
|
||||||
|
|
||||||
if (
|
|
||||||
self._is_cooldown_required(
|
|
||||||
model_id=deployment,
|
|
||||||
exception_status=exception_status,
|
|
||||||
exception_str=str(original_exception),
|
|
||||||
)
|
|
||||||
is False
|
|
||||||
):
|
|
||||||
return
|
|
||||||
|
|
||||||
if deployment in self.provider_default_deployment_ids:
|
|
||||||
return
|
|
||||||
|
|
||||||
_allowed_fails = self.get_allowed_fails_from_policy(
|
|
||||||
exception=original_exception,
|
|
||||||
)
|
|
||||||
|
|
||||||
allowed_fails = (
|
|
||||||
_allowed_fails if _allowed_fails is not None else self.allowed_fails
|
|
||||||
)
|
|
||||||
|
|
||||||
dt = get_utc_datetime()
|
|
||||||
current_minute = dt.strftime("%H-%M")
|
|
||||||
# get current fails for deployment
|
|
||||||
# update the number of failed calls
|
|
||||||
# if it's > allowed fails
|
|
||||||
# cooldown deployment
|
|
||||||
current_fails = self.failed_calls.get_cache(key=deployment) or 0
|
|
||||||
updated_fails = current_fails + 1
|
|
||||||
verbose_router_logger.debug(
|
|
||||||
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {allowed_fails}"
|
|
||||||
)
|
|
||||||
cooldown_time = self.cooldown_time or 1
|
|
||||||
if time_to_cooldown is not None:
|
|
||||||
cooldown_time = time_to_cooldown
|
|
||||||
|
|
||||||
if isinstance(exception_status, str):
|
|
||||||
try:
|
|
||||||
exception_status = int(exception_status)
|
|
||||||
except Exception as e:
|
|
||||||
verbose_router_logger.debug(
|
|
||||||
"Unable to cast exception status to int {}. Defaulting to status=500.".format(
|
|
||||||
exception_status
|
|
||||||
)
|
|
||||||
)
|
|
||||||
exception_status = 500
|
|
||||||
_should_retry = litellm._should_retry(status_code=exception_status)
|
|
||||||
|
|
||||||
if updated_fails > allowed_fails or _should_retry is False:
|
|
||||||
# get the current cooldown list for that minute
|
|
||||||
verbose_router_logger.debug(f"adding {deployment} to cooldown models")
|
|
||||||
# update value
|
|
||||||
self.cooldown_cache.add_deployment_to_cooldown(
|
|
||||||
model_id=deployment,
|
|
||||||
original_exception=original_exception,
|
|
||||||
exception_status=exception_status,
|
|
||||||
cooldown_time=cooldown_time,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Trigger cooldown handler
|
|
||||||
asyncio.create_task(
|
|
||||||
router_cooldown_handler(
|
|
||||||
litellm_router_instance=self,
|
|
||||||
deployment_id=deployment,
|
|
||||||
exception_status=exception_status,
|
|
||||||
cooldown_time=cooldown_time,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.failed_calls.set_cache(
|
|
||||||
key=deployment, value=updated_fails, ttl=cooldown_time
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _async_get_cooldown_deployments(self) -> List[str]:
|
|
||||||
"""
|
|
||||||
Async implementation of '_get_cooldown_deployments'
|
|
||||||
"""
|
|
||||||
model_ids = self.get_model_ids()
|
|
||||||
cooldown_models = await self.cooldown_cache.async_get_active_cooldowns(
|
|
||||||
model_ids=model_ids
|
|
||||||
)
|
|
||||||
|
|
||||||
cached_value_deployment_ids = []
|
|
||||||
if (
|
|
||||||
cooldown_models is not None
|
|
||||||
and isinstance(cooldown_models, list)
|
|
||||||
and len(cooldown_models) > 0
|
|
||||||
and isinstance(cooldown_models[0], tuple)
|
|
||||||
):
|
|
||||||
cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
|
|
||||||
|
|
||||||
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
|
|
||||||
return cached_value_deployment_ids
|
|
||||||
|
|
||||||
async def _async_get_cooldown_deployments_with_debug_info(self) -> List[tuple]:
|
|
||||||
"""
|
|
||||||
Async implementation of '_get_cooldown_deployments'
|
|
||||||
"""
|
|
||||||
model_ids = self.get_model_ids()
|
|
||||||
cooldown_models = await self.cooldown_cache.async_get_active_cooldowns(
|
|
||||||
model_ids=model_ids
|
|
||||||
)
|
|
||||||
|
|
||||||
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
|
|
||||||
return cooldown_models
|
|
||||||
|
|
||||||
def _get_cooldown_deployments(self) -> List[str]:
|
|
||||||
"""
|
|
||||||
Get the list of models being cooled down for this minute
|
|
||||||
"""
|
|
||||||
# get the current cooldown list for that minute
|
|
||||||
|
|
||||||
# ----------------------
|
|
||||||
# Return cooldown models
|
|
||||||
# ----------------------
|
|
||||||
model_ids = self.get_model_ids()
|
|
||||||
cooldown_models = self.cooldown_cache.get_active_cooldowns(model_ids=model_ids)
|
|
||||||
|
|
||||||
cached_value_deployment_ids = []
|
|
||||||
if (
|
|
||||||
cooldown_models is not None
|
|
||||||
and isinstance(cooldown_models, list)
|
|
||||||
and len(cooldown_models) > 0
|
|
||||||
and isinstance(cooldown_models[0], tuple)
|
|
||||||
):
|
|
||||||
cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
|
|
||||||
|
|
||||||
return cached_value_deployment_ids
|
|
||||||
|
|
||||||
def _get_healthy_deployments(self, model: str):
|
def _get_healthy_deployments(self, model: str):
|
||||||
_all_deployments: list = []
|
_all_deployments: list = []
|
||||||
try:
|
try:
|
||||||
|
@ -3913,7 +3816,7 @@ class Router:
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
unhealthy_deployments = self._get_cooldown_deployments()
|
unhealthy_deployments = _get_cooldown_deployments(litellm_router_instance=self)
|
||||||
healthy_deployments: list = []
|
healthy_deployments: list = []
|
||||||
for deployment in _all_deployments:
|
for deployment in _all_deployments:
|
||||||
if deployment["model_info"]["id"] in unhealthy_deployments:
|
if deployment["model_info"]["id"] in unhealthy_deployments:
|
||||||
|
@ -3930,11 +3833,13 @@ class Router:
|
||||||
model=model,
|
model=model,
|
||||||
)
|
)
|
||||||
if type(_all_deployments) == dict:
|
if type(_all_deployments) == dict:
|
||||||
return []
|
return [], _all_deployments
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
unhealthy_deployments = await self._async_get_cooldown_deployments()
|
unhealthy_deployments = await _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance=self
|
||||||
|
)
|
||||||
healthy_deployments: list = []
|
healthy_deployments: list = []
|
||||||
for deployment in _all_deployments:
|
for deployment in _all_deployments:
|
||||||
if deployment["model_info"]["id"] in unhealthy_deployments:
|
if deployment["model_info"]["id"] in unhealthy_deployments:
|
||||||
|
@ -3992,7 +3897,8 @@ class Router:
|
||||||
target=logging_obj.failure_handler,
|
target=logging_obj.failure_handler,
|
||||||
args=(e, traceback.format_exc()),
|
args=(e, traceback.format_exc()),
|
||||||
).start() # log response
|
).start() # log response
|
||||||
self._set_cooldown_deployments(
|
_set_cooldown_deployments(
|
||||||
|
litellm_router_instance=self,
|
||||||
exception_status=e.status_code,
|
exception_status=e.status_code,
|
||||||
original_exception=e,
|
original_exception=e,
|
||||||
deployment=deployment["model_info"]["id"],
|
deployment=deployment["model_info"]["id"],
|
||||||
|
@ -5241,7 +5147,9 @@ class Router:
|
||||||
# filter out the deployments currently cooling down
|
# filter out the deployments currently cooling down
|
||||||
deployments_to_remove = []
|
deployments_to_remove = []
|
||||||
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
|
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
|
||||||
cooldown_deployments = await self._async_get_cooldown_deployments()
|
cooldown_deployments = await _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance=self
|
||||||
|
)
|
||||||
verbose_router_logger.debug(
|
verbose_router_logger.debug(
|
||||||
f"async cooldown deployments: {cooldown_deployments}"
|
f"async cooldown deployments: {cooldown_deployments}"
|
||||||
)
|
)
|
||||||
|
@ -5283,7 +5191,7 @@ class Router:
|
||||||
_cooldown_time = self.cooldown_cache.get_min_cooldown(
|
_cooldown_time = self.cooldown_cache.get_min_cooldown(
|
||||||
model_ids=model_ids
|
model_ids=model_ids
|
||||||
)
|
)
|
||||||
_cooldown_list = self._get_cooldown_deployments()
|
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
|
||||||
raise RouterRateLimitError(
|
raise RouterRateLimitError(
|
||||||
model=model,
|
model=model,
|
||||||
cooldown_time=_cooldown_time,
|
cooldown_time=_cooldown_time,
|
||||||
|
@ -5398,7 +5306,7 @@ class Router:
|
||||||
_cooldown_time = self.cooldown_cache.get_min_cooldown(
|
_cooldown_time = self.cooldown_cache.get_min_cooldown(
|
||||||
model_ids=model_ids
|
model_ids=model_ids
|
||||||
)
|
)
|
||||||
_cooldown_list = self._get_cooldown_deployments()
|
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
|
||||||
raise RouterRateLimitError(
|
raise RouterRateLimitError(
|
||||||
model=model,
|
model=model,
|
||||||
cooldown_time=_cooldown_time,
|
cooldown_time=_cooldown_time,
|
||||||
|
@ -5456,7 +5364,7 @@ class Router:
|
||||||
# filter out the deployments currently cooling down
|
# filter out the deployments currently cooling down
|
||||||
deployments_to_remove = []
|
deployments_to_remove = []
|
||||||
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
|
# cooldown_deployments is a list of model_id's cooling down, cooldown_deployments = ["16700539-b3cd-42f4-b426-6a12a1bb706a", "16700539-b3cd-42f4-b426-7899"]
|
||||||
cooldown_deployments = self._get_cooldown_deployments()
|
cooldown_deployments = _get_cooldown_deployments(litellm_router_instance=self)
|
||||||
verbose_router_logger.debug(f"cooldown deployments: {cooldown_deployments}")
|
verbose_router_logger.debug(f"cooldown deployments: {cooldown_deployments}")
|
||||||
# Find deployments in model_list whose model_id is cooling down
|
# Find deployments in model_list whose model_id is cooling down
|
||||||
for deployment in healthy_deployments:
|
for deployment in healthy_deployments:
|
||||||
|
@ -5479,7 +5387,7 @@ class Router:
|
||||||
if len(healthy_deployments) == 0:
|
if len(healthy_deployments) == 0:
|
||||||
model_ids = self.get_model_ids(model_name=model)
|
model_ids = self.get_model_ids(model_name=model)
|
||||||
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
|
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
|
||||||
_cooldown_list = self._get_cooldown_deployments()
|
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
|
||||||
raise RouterRateLimitError(
|
raise RouterRateLimitError(
|
||||||
model=model,
|
model=model,
|
||||||
cooldown_time=_cooldown_time,
|
cooldown_time=_cooldown_time,
|
||||||
|
@ -5588,7 +5496,7 @@ class Router:
|
||||||
)
|
)
|
||||||
model_ids = self.get_model_ids(model_name=model)
|
model_ids = self.get_model_ids(model_name=model)
|
||||||
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
|
_cooldown_time = self.cooldown_cache.get_min_cooldown(model_ids=model_ids)
|
||||||
_cooldown_list = self._get_cooldown_deployments()
|
_cooldown_list = _get_cooldown_deployments(litellm_router_instance=self)
|
||||||
raise RouterRateLimitError(
|
raise RouterRateLimitError(
|
||||||
model=model,
|
model=model,
|
||||||
cooldown_time=_cooldown_time,
|
cooldown_time=_cooldown_time,
|
||||||
|
|
|
@ -83,7 +83,7 @@ class CooldownCache:
|
||||||
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
|
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
|
||||||
|
|
||||||
# Retrieve the values for the keys using mget
|
# Retrieve the values for the keys using mget
|
||||||
results = await self.cache.async_batch_get_cache(keys=keys)
|
results = await self.cache.async_batch_get_cache(keys=keys) or []
|
||||||
|
|
||||||
active_cooldowns = []
|
active_cooldowns = []
|
||||||
# Process the results
|
# Process the results
|
||||||
|
@ -101,7 +101,7 @@ class CooldownCache:
|
||||||
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
|
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
|
||||||
|
|
||||||
# Retrieve the values for the keys using mget
|
# Retrieve the values for the keys using mget
|
||||||
results = self.cache.batch_get_cache(keys=keys)
|
results = self.cache.batch_get_cache(keys=keys) or []
|
||||||
|
|
||||||
active_cooldowns = []
|
active_cooldowns = []
|
||||||
# Process the results
|
# Process the results
|
||||||
|
@ -119,17 +119,19 @@ class CooldownCache:
|
||||||
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
|
keys = [f"deployment:{model_id}:cooldown" for model_id in model_ids]
|
||||||
|
|
||||||
# Retrieve the values for the keys using mget
|
# Retrieve the values for the keys using mget
|
||||||
results = self.cache.batch_get_cache(keys=keys)
|
results = self.cache.batch_get_cache(keys=keys) or []
|
||||||
|
|
||||||
min_cooldown_time = self.default_cooldown_time
|
min_cooldown_time: Optional[float] = None
|
||||||
# Process the results
|
# Process the results
|
||||||
for model_id, result in zip(model_ids, results):
|
for model_id, result in zip(model_ids, results):
|
||||||
if result and isinstance(result, dict):
|
if result and isinstance(result, dict):
|
||||||
cooldown_cache_value = CooldownCacheValue(**result) # type: ignore
|
cooldown_cache_value = CooldownCacheValue(**result) # type: ignore
|
||||||
if cooldown_cache_value["cooldown_time"] < min_cooldown_time:
|
if min_cooldown_time is None:
|
||||||
|
min_cooldown_time = cooldown_cache_value["cooldown_time"]
|
||||||
|
elif cooldown_cache_value["cooldown_time"] < min_cooldown_time:
|
||||||
min_cooldown_time = cooldown_cache_value["cooldown_time"]
|
min_cooldown_time = cooldown_cache_value["cooldown_time"]
|
||||||
|
|
||||||
return min_cooldown_time
|
return min_cooldown_time or self.default_cooldown_time
|
||||||
|
|
||||||
|
|
||||||
# Usage example:
|
# Usage example:
|
||||||
|
|
309
litellm/router_utils/cooldown_handlers.py
Normal file
309
litellm/router_utils/cooldown_handlers.py
Normal file
|
@ -0,0 +1,309 @@
|
||||||
|
"""
|
||||||
|
Router cooldown handlers
|
||||||
|
- _set_cooldown_deployments: puts a deployment in the cooldown list
|
||||||
|
- get_cooldown_deployments: returns the list of deployments in the cooldown list
|
||||||
|
- async_get_cooldown_deployments: ASYNC: returns the list of deployments in the cooldown list
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import TYPE_CHECKING, Any, List, Optional, Union
|
||||||
|
|
||||||
|
import litellm
|
||||||
|
from litellm._logging import verbose_router_logger
|
||||||
|
from litellm.router_utils.cooldown_callbacks import router_cooldown_handler
|
||||||
|
from litellm.utils import get_utc_datetime
|
||||||
|
|
||||||
|
from .router_callbacks.track_deployment_metrics import (
|
||||||
|
get_deployment_failures_for_current_minute,
|
||||||
|
get_deployment_successes_for_current_minute,
|
||||||
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from litellm.router import Router as _Router
|
||||||
|
|
||||||
|
LitellmRouter = _Router
|
||||||
|
else:
|
||||||
|
LitellmRouter = Any
|
||||||
|
|
||||||
|
DEFAULT_FAILURE_THRESHOLD_PERCENT = (
|
||||||
|
0.5 # default cooldown a deployment if 50% of requests fail in a given minute
|
||||||
|
)
|
||||||
|
DEFAULT_COOLDOWN_TIME_SECONDS = 5
|
||||||
|
|
||||||
|
|
||||||
|
def _should_run_cooldown_logic(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
deployment: Optional[str],
|
||||||
|
exception_status: Union[str, int],
|
||||||
|
original_exception: Any,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Helper that decides if cooldown logic should be run
|
||||||
|
Returns False if cooldown logic should not be run
|
||||||
|
|
||||||
|
Does not run cooldown logic when:
|
||||||
|
- router.disable_cooldowns is True
|
||||||
|
- deployment is None
|
||||||
|
- _is_cooldown_required() returns False
|
||||||
|
- deployment is in litellm_router_instance.provider_default_deployment_ids
|
||||||
|
- exception_status is not one that should be immediately retried (e.g. 401)
|
||||||
|
"""
|
||||||
|
if litellm_router_instance.disable_cooldowns:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if deployment is None:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not litellm_router_instance._is_cooldown_required(
|
||||||
|
model_id=deployment,
|
||||||
|
exception_status=exception_status,
|
||||||
|
exception_str=str(original_exception),
|
||||||
|
):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if deployment in litellm_router_instance.provider_default_deployment_ids:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _should_cooldown_deployment(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
deployment: str,
|
||||||
|
exception_status: Union[str, int],
|
||||||
|
original_exception: Any,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Helper that decides if a deployment should be put in cooldown
|
||||||
|
|
||||||
|
Returns True if the deployment should be put in cooldown
|
||||||
|
Returns False if the deployment should not be put in cooldown
|
||||||
|
|
||||||
|
|
||||||
|
Deployment is put in cooldown when:
|
||||||
|
- v2 logic (Current):
|
||||||
|
cooldown if:
|
||||||
|
- got a 429 error from LLM API
|
||||||
|
- if %fails/%(successes + fails) > ALLOWED_FAILURE_RATE_PER_MINUTE
|
||||||
|
- got 401 Auth error, 404 NotFounder - checked by litellm._should_retry()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- v1 logic (Legacy): if allowed fails or allowed fail policy set, coolsdown if num fails in this minute > allowed fails
|
||||||
|
"""
|
||||||
|
if litellm_router_instance.allowed_fails_policy is None:
|
||||||
|
num_successes_this_minute = get_deployment_successes_for_current_minute(
|
||||||
|
litellm_router_instance=litellm_router_instance, deployment_id=deployment
|
||||||
|
)
|
||||||
|
num_fails_this_minute = get_deployment_failures_for_current_minute(
|
||||||
|
litellm_router_instance=litellm_router_instance, deployment_id=deployment
|
||||||
|
)
|
||||||
|
|
||||||
|
total_requests_this_minute = num_successes_this_minute + num_fails_this_minute
|
||||||
|
percent_fails = 0.0
|
||||||
|
if total_requests_this_minute > 0:
|
||||||
|
percent_fails = num_fails_this_minute / (
|
||||||
|
num_successes_this_minute + num_fails_this_minute
|
||||||
|
)
|
||||||
|
verbose_router_logger.debug(
|
||||||
|
"percent fails for deployment = %s, percent fails = %s, num successes = %s, num fails = %s",
|
||||||
|
deployment,
|
||||||
|
percent_fails,
|
||||||
|
num_successes_this_minute,
|
||||||
|
num_fails_this_minute,
|
||||||
|
)
|
||||||
|
exception_status_int = cast_exception_status_to_int(exception_status)
|
||||||
|
if exception_status_int == 429:
|
||||||
|
return True
|
||||||
|
elif (
|
||||||
|
total_requests_this_minute == 1
|
||||||
|
): # if the 1st request fails it's not guaranteed that the deployment should be cooled down
|
||||||
|
return False
|
||||||
|
elif percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT:
|
||||||
|
return True
|
||||||
|
|
||||||
|
elif (
|
||||||
|
litellm._should_retry(
|
||||||
|
status_code=cast_exception_status_to_int(exception_status)
|
||||||
|
)
|
||||||
|
is False
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return should_cooldown_based_on_allowed_fails_policy(
|
||||||
|
litellm_router_instance=litellm_router_instance,
|
||||||
|
deployment=deployment,
|
||||||
|
original_exception=original_exception,
|
||||||
|
)
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _set_cooldown_deployments(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
original_exception: Any,
|
||||||
|
exception_status: Union[str, int],
|
||||||
|
deployment: Optional[str] = None,
|
||||||
|
time_to_cooldown: Optional[float] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
the exception is not one that should be immediately retried (e.g. 401)
|
||||||
|
"""
|
||||||
|
if (
|
||||||
|
_should_run_cooldown_logic(
|
||||||
|
litellm_router_instance, deployment, exception_status, original_exception
|
||||||
|
)
|
||||||
|
is False
|
||||||
|
or deployment is None
|
||||||
|
):
|
||||||
|
return
|
||||||
|
|
||||||
|
exception_status_int = cast_exception_status_to_int(exception_status)
|
||||||
|
|
||||||
|
verbose_router_logger.debug(f"Attempting to add {deployment} to cooldown list")
|
||||||
|
cooldown_time = litellm_router_instance.cooldown_time or 1
|
||||||
|
if time_to_cooldown is not None:
|
||||||
|
cooldown_time = time_to_cooldown
|
||||||
|
|
||||||
|
if _should_cooldown_deployment(
|
||||||
|
litellm_router_instance, deployment, exception_status, original_exception
|
||||||
|
):
|
||||||
|
litellm_router_instance.cooldown_cache.add_deployment_to_cooldown(
|
||||||
|
model_id=deployment,
|
||||||
|
original_exception=original_exception,
|
||||||
|
exception_status=exception_status_int,
|
||||||
|
cooldown_time=cooldown_time,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Trigger cooldown callback handler
|
||||||
|
asyncio.create_task(
|
||||||
|
router_cooldown_handler(
|
||||||
|
litellm_router_instance=litellm_router_instance,
|
||||||
|
deployment_id=deployment,
|
||||||
|
exception_status=exception_status,
|
||||||
|
cooldown_time=cooldown_time,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
) -> List[str]:
|
||||||
|
"""
|
||||||
|
Async implementation of '_get_cooldown_deployments'
|
||||||
|
"""
|
||||||
|
model_ids = litellm_router_instance.get_model_ids()
|
||||||
|
cooldown_models = (
|
||||||
|
await litellm_router_instance.cooldown_cache.async_get_active_cooldowns(
|
||||||
|
model_ids=model_ids
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
cached_value_deployment_ids = []
|
||||||
|
if (
|
||||||
|
cooldown_models is not None
|
||||||
|
and isinstance(cooldown_models, list)
|
||||||
|
and len(cooldown_models) > 0
|
||||||
|
and isinstance(cooldown_models[0], tuple)
|
||||||
|
):
|
||||||
|
cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
|
||||||
|
|
||||||
|
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
|
||||||
|
return cached_value_deployment_ids
|
||||||
|
|
||||||
|
|
||||||
|
async def _async_get_cooldown_deployments_with_debug_info(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
) -> List[tuple]:
|
||||||
|
"""
|
||||||
|
Async implementation of '_get_cooldown_deployments'
|
||||||
|
"""
|
||||||
|
model_ids = litellm_router_instance.get_model_ids()
|
||||||
|
cooldown_models = (
|
||||||
|
await litellm_router_instance.cooldown_cache.async_get_active_cooldowns(
|
||||||
|
model_ids=model_ids
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
|
||||||
|
return cooldown_models
|
||||||
|
|
||||||
|
|
||||||
|
def _get_cooldown_deployments(litellm_router_instance: LitellmRouter) -> List[str]:
|
||||||
|
"""
|
||||||
|
Get the list of models being cooled down for this minute
|
||||||
|
"""
|
||||||
|
# get the current cooldown list for that minute
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Return cooldown models
|
||||||
|
# ----------------------
|
||||||
|
model_ids = litellm_router_instance.get_model_ids()
|
||||||
|
cooldown_models = litellm_router_instance.cooldown_cache.get_active_cooldowns(
|
||||||
|
model_ids=model_ids
|
||||||
|
)
|
||||||
|
|
||||||
|
cached_value_deployment_ids = []
|
||||||
|
if (
|
||||||
|
cooldown_models is not None
|
||||||
|
and isinstance(cooldown_models, list)
|
||||||
|
and len(cooldown_models) > 0
|
||||||
|
and isinstance(cooldown_models[0], tuple)
|
||||||
|
):
|
||||||
|
cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
|
||||||
|
|
||||||
|
return cached_value_deployment_ids
|
||||||
|
|
||||||
|
|
||||||
|
def should_cooldown_based_on_allowed_fails_policy(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
deployment: str,
|
||||||
|
original_exception: Any,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Check if fails are within the allowed limit and update the number of fails.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- True if fails exceed the allowed limit (should cooldown)
|
||||||
|
- False if fails are within the allowed limit (should not cooldown)
|
||||||
|
"""
|
||||||
|
allowed_fails = (
|
||||||
|
litellm_router_instance.get_allowed_fails_from_policy(
|
||||||
|
exception=original_exception,
|
||||||
|
)
|
||||||
|
or litellm_router_instance.allowed_fails
|
||||||
|
)
|
||||||
|
cooldown_time = (
|
||||||
|
litellm_router_instance.cooldown_time or DEFAULT_COOLDOWN_TIME_SECONDS
|
||||||
|
)
|
||||||
|
|
||||||
|
current_fails = litellm_router_instance.failed_calls.get_cache(key=deployment) or 0
|
||||||
|
updated_fails = current_fails + 1
|
||||||
|
|
||||||
|
if updated_fails > allowed_fails:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
litellm_router_instance.failed_calls.set_cache(
|
||||||
|
key=deployment, value=updated_fails, ttl=cooldown_time
|
||||||
|
)
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def cast_exception_status_to_int(exception_status: Union[str, int]) -> int:
|
||||||
|
if isinstance(exception_status, str):
|
||||||
|
try:
|
||||||
|
exception_status = int(exception_status)
|
||||||
|
except Exception as e:
|
||||||
|
verbose_router_logger.debug(
|
||||||
|
f"Unable to cast exception status to int {exception_status}. Defaulting to status=500."
|
||||||
|
)
|
||||||
|
exception_status = 500
|
||||||
|
return exception_status
|
|
@ -0,0 +1,91 @@
|
||||||
|
"""
|
||||||
|
Helper functions to get/set num success and num failures per deployment
|
||||||
|
|
||||||
|
|
||||||
|
set_deployment_failures_for_current_minute
|
||||||
|
set_deployment_successes_for_current_minute
|
||||||
|
|
||||||
|
get_deployment_failures_for_current_minute
|
||||||
|
get_deployment_successes_for_current_minute
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING, Any, Callable, Optional
|
||||||
|
|
||||||
|
from litellm.utils import get_utc_datetime
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from litellm.router import Router as _Router
|
||||||
|
|
||||||
|
LitellmRouter = _Router
|
||||||
|
else:
|
||||||
|
LitellmRouter = Any
|
||||||
|
|
||||||
|
|
||||||
|
def increment_deployment_successes_for_current_minute(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
deployment_id: str,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
In-Memory: Increments the number of successes for the current minute for a deployment_id
|
||||||
|
"""
|
||||||
|
key = f"{deployment_id}:successes"
|
||||||
|
litellm_router_instance.cache.increment_cache(
|
||||||
|
local_only=True,
|
||||||
|
key=key,
|
||||||
|
value=1,
|
||||||
|
ttl=60,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def increment_deployment_failures_for_current_minute(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
deployment_id: str,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
In-Memory: Increments the number of failures for the current minute for a deployment_id
|
||||||
|
"""
|
||||||
|
key = f"{deployment_id}:fails"
|
||||||
|
litellm_router_instance.cache.increment_cache(
|
||||||
|
local_only=True,
|
||||||
|
key=key,
|
||||||
|
value=1,
|
||||||
|
ttl=60,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_deployment_successes_for_current_minute(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
deployment_id: str,
|
||||||
|
) -> int:
|
||||||
|
"""
|
||||||
|
Returns the number of successes for the current minute for a deployment_id
|
||||||
|
|
||||||
|
Returns 0 if no value found
|
||||||
|
"""
|
||||||
|
key = f"{deployment_id}:successes"
|
||||||
|
return (
|
||||||
|
litellm_router_instance.cache.get_cache(
|
||||||
|
local_only=True,
|
||||||
|
key=key,
|
||||||
|
)
|
||||||
|
or 0
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_deployment_failures_for_current_minute(
|
||||||
|
litellm_router_instance: LitellmRouter,
|
||||||
|
deployment_id: str,
|
||||||
|
) -> int:
|
||||||
|
"""
|
||||||
|
Returns the number of fails for the current minute for a deployment_id
|
||||||
|
|
||||||
|
Returns 0 if no value found
|
||||||
|
"""
|
||||||
|
key = f"{deployment_id}:fails"
|
||||||
|
return (
|
||||||
|
litellm_router_instance.cache.get_cache(
|
||||||
|
local_only=True,
|
||||||
|
key=key,
|
||||||
|
)
|
||||||
|
or 0
|
||||||
|
)
|
|
@ -28,6 +28,10 @@ from pydantic import BaseModel
|
||||||
import litellm
|
import litellm
|
||||||
from litellm import Router
|
from litellm import Router
|
||||||
from litellm.router import Deployment, LiteLLM_Params, ModelInfo
|
from litellm.router import Deployment, LiteLLM_Params, ModelInfo
|
||||||
|
from litellm.router_utils.cooldown_handlers import (
|
||||||
|
_async_get_cooldown_deployments,
|
||||||
|
_get_cooldown_deployments,
|
||||||
|
)
|
||||||
from litellm.types.router import DeploymentTypedDict
|
from litellm.types.router import DeploymentTypedDict
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
@ -2265,6 +2269,7 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
||||||
{"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
|
{"message": "litellm.proxy.proxy_server.embeddings(): Exception occured - No deployments available for selected model, Try again in 60 seconds. Passed model=text-embedding-ada-002. pre-call-checks=False, allowed_model_region=n/a, cooldown_list=[('b49cbc9314273db7181fe69b1b19993f04efb88f2c1819947c538bac08097e4c', {'Exception Received': 'litellm.RateLimitError: AzureException RateLimitError - Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.', 'Status Code': '429'})]", "level": "ERROR", "timestamp": "2024-08-22T03:25:36.900476"}
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
litellm.set_verbose = True
|
||||||
router = Router(
|
router = Router(
|
||||||
model_list=[
|
model_list=[
|
||||||
{
|
{
|
||||||
|
@ -2279,7 +2284,9 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
||||||
"model": "openai/text-embedding-ada-002",
|
"model": "openai/text-embedding-ada-002",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
]
|
],
|
||||||
|
set_verbose=True,
|
||||||
|
debug_level="DEBUG",
|
||||||
)
|
)
|
||||||
|
|
||||||
openai_client = openai.OpenAI(api_key="")
|
openai_client = openai.OpenAI(api_key="")
|
||||||
|
@ -2300,7 +2307,7 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
||||||
"create",
|
"create",
|
||||||
side_effect=_return_exception,
|
side_effect=_return_exception,
|
||||||
):
|
):
|
||||||
for _ in range(2):
|
for _ in range(1):
|
||||||
try:
|
try:
|
||||||
if sync_mode:
|
if sync_mode:
|
||||||
router.embedding(
|
router.embedding(
|
||||||
|
@ -2318,9 +2325,13 @@ async def test_aaarouter_dynamic_cooldown_message_retry_time(sync_mode):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if sync_mode:
|
if sync_mode:
|
||||||
cooldown_deployments = router._get_cooldown_deployments()
|
cooldown_deployments = _get_cooldown_deployments(
|
||||||
|
litellm_router_instance=router
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
cooldown_deployments = await router._async_get_cooldown_deployments()
|
cooldown_deployments = await _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance=router
|
||||||
|
)
|
||||||
print(
|
print(
|
||||||
"Cooldown deployments - {}\n{}".format(
|
"Cooldown deployments - {}\n{}".format(
|
||||||
cooldown_deployments, len(cooldown_deployments)
|
cooldown_deployments, len(cooldown_deployments)
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
|
@ -21,6 +22,7 @@ import openai
|
||||||
import litellm
|
import litellm
|
||||||
from litellm import Router
|
from litellm import Router
|
||||||
from litellm.integrations.custom_logger import CustomLogger
|
from litellm.integrations.custom_logger import CustomLogger
|
||||||
|
from litellm.router_utils.cooldown_handlers import _async_get_cooldown_deployments
|
||||||
from litellm.types.router import DeploymentTypedDict, LiteLLMParamsTypedDict
|
from litellm.types.router import DeploymentTypedDict, LiteLLMParamsTypedDict
|
||||||
|
|
||||||
|
|
||||||
|
@ -239,7 +241,9 @@ async def test_single_deployment_no_cooldowns_test_prod_mock_completion_calls():
|
||||||
except litellm.RateLimitError:
|
except litellm.RateLimitError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
cooldown_list = await router._async_get_cooldown_deployments()
|
cooldown_list = await _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance=router
|
||||||
|
)
|
||||||
assert len(cooldown_list) == 0
|
assert len(cooldown_list) == 0
|
||||||
|
|
||||||
healthy_deployments, _ = await router._async_get_healthy_deployments(
|
healthy_deployments, _ = await router._async_get_healthy_deployments(
|
||||||
|
@ -247,3 +251,312 @@ async def test_single_deployment_no_cooldowns_test_prod_mock_completion_calls():
|
||||||
)
|
)
|
||||||
|
|
||||||
print("healthy_deployments: ", healthy_deployments)
|
print("healthy_deployments: ", healthy_deployments)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
E2E - Test router cooldowns
|
||||||
|
|
||||||
|
Test 1: 3 deployments, each deployment fails 25% requests. Assert that no deployments get put into cooldown
|
||||||
|
Test 2: 3 deployments, 1- deployment fails 6/10 requests, assert that bad deployment gets put into cooldown
|
||||||
|
Test 3: 3 deployments, 1 deployment has a period of 429 errors. Assert it is put into cooldown and other deployments work
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio()
|
||||||
|
async def test_high_traffic_cooldowns_all_healthy_deployments():
|
||||||
|
"""
|
||||||
|
PROD TEST - 3 deployments, each deployment fails 25% requests. Assert that no deployments get put into cooldown
|
||||||
|
"""
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com-2",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com-3",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
set_verbose=True,
|
||||||
|
debug_level="DEBUG",
|
||||||
|
)
|
||||||
|
|
||||||
|
all_deployment_ids = router.get_model_ids()
|
||||||
|
|
||||||
|
import random
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Create a defaultdict to track successes and failures for each model ID
|
||||||
|
model_stats = defaultdict(lambda: {"successes": 0, "failures": 0})
|
||||||
|
|
||||||
|
litellm.set_verbose = True
|
||||||
|
for _ in range(100):
|
||||||
|
try:
|
||||||
|
model_id = random.choice(all_deployment_ids)
|
||||||
|
|
||||||
|
num_successes = model_stats[model_id]["successes"]
|
||||||
|
num_failures = model_stats[model_id]["failures"]
|
||||||
|
total_requests = num_failures + num_successes
|
||||||
|
if total_requests > 0:
|
||||||
|
print(
|
||||||
|
"num failures= ",
|
||||||
|
num_failures,
|
||||||
|
"num successes= ",
|
||||||
|
num_successes,
|
||||||
|
"num_failures/total = ",
|
||||||
|
num_failures / total_requests,
|
||||||
|
)
|
||||||
|
|
||||||
|
if total_requests == 0:
|
||||||
|
mock_response = "hi"
|
||||||
|
elif num_failures / total_requests <= 0.25:
|
||||||
|
# Randomly decide between fail and succeed
|
||||||
|
if random.random() < 0.5:
|
||||||
|
mock_response = "hi"
|
||||||
|
else:
|
||||||
|
mock_response = "litellm.InternalServerError"
|
||||||
|
else:
|
||||||
|
mock_response = "hi"
|
||||||
|
|
||||||
|
await router.acompletion(
|
||||||
|
model=model_id,
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
mock_response=mock_response,
|
||||||
|
)
|
||||||
|
model_stats[model_id]["successes"] += 1
|
||||||
|
|
||||||
|
await asyncio.sleep(0.0001)
|
||||||
|
except litellm.InternalServerError:
|
||||||
|
model_stats[model_id]["failures"] += 1
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
print("Failed test model stats=", model_stats)
|
||||||
|
raise e
|
||||||
|
print("model_stats: ", model_stats)
|
||||||
|
|
||||||
|
cooldown_list = await _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance=router
|
||||||
|
)
|
||||||
|
assert len(cooldown_list) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio()
|
||||||
|
async def test_high_traffic_cooldowns_one_bad_deployment():
|
||||||
|
"""
|
||||||
|
PROD TEST - 3 deployments, 1- deployment fails 6/10 requests, assert that bad deployment gets put into cooldown
|
||||||
|
"""
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com-2",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com-3",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
set_verbose=True,
|
||||||
|
debug_level="DEBUG",
|
||||||
|
)
|
||||||
|
|
||||||
|
all_deployment_ids = router.get_model_ids()
|
||||||
|
|
||||||
|
import random
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Create a defaultdict to track successes and failures for each model ID
|
||||||
|
model_stats = defaultdict(lambda: {"successes": 0, "failures": 0})
|
||||||
|
bad_deployment_id = random.choice(all_deployment_ids)
|
||||||
|
litellm.set_verbose = True
|
||||||
|
for _ in range(100):
|
||||||
|
try:
|
||||||
|
model_id = random.choice(all_deployment_ids)
|
||||||
|
|
||||||
|
num_successes = model_stats[model_id]["successes"]
|
||||||
|
num_failures = model_stats[model_id]["failures"]
|
||||||
|
total_requests = num_failures + num_successes
|
||||||
|
if total_requests > 0:
|
||||||
|
print(
|
||||||
|
"num failures= ",
|
||||||
|
num_failures,
|
||||||
|
"num successes= ",
|
||||||
|
num_successes,
|
||||||
|
"num_failures/total = ",
|
||||||
|
num_failures / total_requests,
|
||||||
|
)
|
||||||
|
|
||||||
|
if total_requests == 0:
|
||||||
|
mock_response = "hi"
|
||||||
|
elif bad_deployment_id == model_id:
|
||||||
|
if num_failures / total_requests <= 0.6:
|
||||||
|
|
||||||
|
mock_response = "litellm.InternalServerError"
|
||||||
|
|
||||||
|
elif num_failures / total_requests <= 0.25:
|
||||||
|
# Randomly decide between fail and succeed
|
||||||
|
if random.random() < 0.5:
|
||||||
|
mock_response = "hi"
|
||||||
|
else:
|
||||||
|
mock_response = "litellm.InternalServerError"
|
||||||
|
else:
|
||||||
|
mock_response = "hi"
|
||||||
|
|
||||||
|
await router.acompletion(
|
||||||
|
model=model_id,
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
mock_response=mock_response,
|
||||||
|
)
|
||||||
|
model_stats[model_id]["successes"] += 1
|
||||||
|
|
||||||
|
await asyncio.sleep(0.0001)
|
||||||
|
except litellm.InternalServerError:
|
||||||
|
model_stats[model_id]["failures"] += 1
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
print("Failed test model stats=", model_stats)
|
||||||
|
raise e
|
||||||
|
print("model_stats: ", model_stats)
|
||||||
|
|
||||||
|
cooldown_list = await _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance=router
|
||||||
|
)
|
||||||
|
assert len(cooldown_list) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio()
|
||||||
|
async def test_high_traffic_cooldowns_one_rate_limited_deployment():
|
||||||
|
"""
|
||||||
|
PROD TEST - 3 deployments, 1- deployment fails 6/10 requests, assert that bad deployment gets put into cooldown
|
||||||
|
"""
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com-2",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model_name": "gpt-3.5-turbo",
|
||||||
|
"litellm_params": {
|
||||||
|
"model": "gpt-3.5-turbo",
|
||||||
|
"api_base": "https://api.openai.com-3",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
set_verbose=True,
|
||||||
|
debug_level="DEBUG",
|
||||||
|
)
|
||||||
|
|
||||||
|
all_deployment_ids = router.get_model_ids()
|
||||||
|
|
||||||
|
import random
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Create a defaultdict to track successes and failures for each model ID
|
||||||
|
model_stats = defaultdict(lambda: {"successes": 0, "failures": 0})
|
||||||
|
bad_deployment_id = random.choice(all_deployment_ids)
|
||||||
|
litellm.set_verbose = True
|
||||||
|
for _ in range(100):
|
||||||
|
try:
|
||||||
|
model_id = random.choice(all_deployment_ids)
|
||||||
|
|
||||||
|
num_successes = model_stats[model_id]["successes"]
|
||||||
|
num_failures = model_stats[model_id]["failures"]
|
||||||
|
total_requests = num_failures + num_successes
|
||||||
|
if total_requests > 0:
|
||||||
|
print(
|
||||||
|
"num failures= ",
|
||||||
|
num_failures,
|
||||||
|
"num successes= ",
|
||||||
|
num_successes,
|
||||||
|
"num_failures/total = ",
|
||||||
|
num_failures / total_requests,
|
||||||
|
)
|
||||||
|
|
||||||
|
if total_requests == 0:
|
||||||
|
mock_response = "hi"
|
||||||
|
elif bad_deployment_id == model_id:
|
||||||
|
if num_failures / total_requests <= 0.6:
|
||||||
|
|
||||||
|
mock_response = "litellm.RateLimitError"
|
||||||
|
|
||||||
|
elif num_failures / total_requests <= 0.25:
|
||||||
|
# Randomly decide between fail and succeed
|
||||||
|
if random.random() < 0.5:
|
||||||
|
mock_response = "hi"
|
||||||
|
else:
|
||||||
|
mock_response = "litellm.InternalServerError"
|
||||||
|
else:
|
||||||
|
mock_response = "hi"
|
||||||
|
|
||||||
|
await router.acompletion(
|
||||||
|
model=model_id,
|
||||||
|
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||||
|
mock_response=mock_response,
|
||||||
|
)
|
||||||
|
model_stats[model_id]["successes"] += 1
|
||||||
|
|
||||||
|
await asyncio.sleep(0.0001)
|
||||||
|
except litellm.InternalServerError:
|
||||||
|
model_stats[model_id]["failures"] += 1
|
||||||
|
pass
|
||||||
|
except litellm.RateLimitError:
|
||||||
|
model_stats[bad_deployment_id]["failures"] += 1
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
print("Failed test model stats=", model_stats)
|
||||||
|
raise e
|
||||||
|
print("model_stats: ", model_stats)
|
||||||
|
|
||||||
|
cooldown_list = await _async_get_cooldown_deployments(
|
||||||
|
litellm_router_instance=router
|
||||||
|
)
|
||||||
|
assert len(cooldown_list) == 1
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Unit tests for router set_cooldowns
|
||||||
|
|
||||||
|
1. _set_cooldown_deployments() will cooldown a deployment after it fails 50% requests
|
||||||
|
"""
|
||||||
|
|
|
@ -102,7 +102,7 @@ async def test_router_retries_errors(sync_mode, error_type):
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
router = Router(model_list=model_list, allowed_fails=3)
|
router = Router(model_list=model_list, set_verbose=True, debug_level="DEBUG")
|
||||||
|
|
||||||
customHandler = MyCustomHandler()
|
customHandler = MyCustomHandler()
|
||||||
litellm.callbacks = [customHandler]
|
litellm.callbacks = [customHandler]
|
||||||
|
@ -118,6 +118,12 @@ async def test_router_retries_errors(sync_mode, error_type):
|
||||||
else Exception("Invalid Request")
|
else Exception("Invalid Request")
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
for _ in range(4):
|
||||||
|
response = await router.acompletion(
|
||||||
|
model="azure/gpt-3.5-turbo",
|
||||||
|
messages=messages,
|
||||||
|
mock_response="1st success to ensure deployment is healthy",
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if sync_mode:
|
if sync_mode:
|
||||||
|
|
|
@ -5976,6 +5976,10 @@ def check_valid_key(model: str, api_key: str):
|
||||||
|
|
||||||
def _should_retry(status_code: int):
|
def _should_retry(status_code: int):
|
||||||
"""
|
"""
|
||||||
|
Retries on 408, 409, 429 and 500 errors.
|
||||||
|
|
||||||
|
Any client error in the 400-499 range that isn't explicitly handled (such as 400 Bad Request, 401 Unauthorized, 403 Forbidden, 404 Not Found, etc.) would not trigger a retry.
|
||||||
|
|
||||||
Reimplementation of openai's should retry logic, since that one can't be imported.
|
Reimplementation of openai's should retry logic, since that one can't be imported.
|
||||||
https://github.com/openai/openai-python/blob/af67cfab4210d8e497c05390ce14f39105c77519/src/openai/_base_client.py#L639
|
https://github.com/openai/openai-python/blob/af67cfab4210d8e497c05390ce14f39105c77519/src/openai/_base_client.py#L639
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -86,7 +86,7 @@ async def test_vertex_load():
|
||||||
|
|
||||||
# Assert that the average difference is not more than 20%
|
# Assert that the average difference is not more than 20%
|
||||||
assert (
|
assert (
|
||||||
avg_percentage_diff < 20
|
avg_percentage_diff < 25
|
||||||
), f"Average performance difference of {avg_percentage_diff:.2f}% exceeds 20% threshold"
|
), f"Average performance difference of {avg_percentage_diff:.2f}% exceeds 20% threshold"
|
||||||
|
|
||||||
except litellm.Timeout as e:
|
except litellm.Timeout as e:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue