fix(router.py): disable cooldowns
allow admin to disable model cooldowns
This commit is contained in: parent 43af7fc985, commit 459d294e21.
3 changed files with 88 additions and 13 deletions.
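For context: once this lands, an admin can opt out of cooldowns entirely by passing the new flag to the Router constructor, which is checked in _set_cooldown_deployments before any failure tracking happens. A minimal sketch (the model_list entry mirrors the test config changed below; values are illustrative):

    from litellm import Router

    router = Router(
        model_list=[
            {
                "model_name": "claude-3-5-sonnet",
                "litellm_params": {
                    "model": "openai/*",
                    "mock_response": "litellm.RateLimitError",
                },
            }
        ],
        disable_cooldowns=True,  # new flag: failed deployments stay in rotation
    )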
@@ -476,6 +476,15 @@ def mock_completion(
                 model=model,  # type: ignore
                 request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
             )
+        elif (
+            isinstance(mock_response, str) and mock_response == "litellm.RateLimitError"
+        ):
+            raise litellm.RateLimitError(
+                message="this is a mock rate limit error",
+                status_code=getattr(mock_response, "status_code", 429),  # type: ignore
+                llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"),  # type: ignore
+                model=model,
+            )
         time_delay = kwargs.get("mock_delay", None)
         if time_delay is not None:
             time.sleep(time_delay)
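The new mock_completion branch makes rate-limit handling testable without hitting a real provider. A sketch of triggering it (assuming the usual litellm.completion entry point, which routes mock_response through mock_completion):

    import litellm

    try:
        litellm.completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
            mock_response="litellm.RateLimitError",
        )
    except litellm.RateLimitError as e:
        print(e.status_code)  # 429, the default set in the branch above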
@@ -1,4 +1,5 @@
 model_list:
-  - model_name: "*" # all requests where model not in your config go to this deployment
+  - model_name: claude-3-5-sonnet # all requests where model not in your config go to this deployment
     litellm_params:
       model: "openai/*"
+      mock_response: "litellm.RateLimitError"
@@ -156,6 +156,7 @@ class Router:
         cooldown_time: Optional[
             float
         ] = None,  # (seconds) time to cooldown a deployment after failure
+        disable_cooldowns: Optional[bool] = None,
         routing_strategy: Literal[
             "simple-shuffle",
             "least-busy",
@@ -307,6 +308,7 @@ class Router:
 
         self.allowed_fails = allowed_fails or litellm.allowed_fails
         self.cooldown_time = cooldown_time or 60
+        self.disable_cooldowns = disable_cooldowns
         self.failed_calls = (
             InMemoryCache()
         )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
@@ -2990,6 +2992,8 @@ class Router:
 
         the exception is not one that should be immediately retried (e.g. 401)
         """
+        if self.disable_cooldowns is True:
+            return
+
         if deployment is None:
             return
@@ -3030,24 +3034,50 @@ class Router:
             exception_status = 500
         _should_retry = litellm._should_retry(status_code=exception_status)
 
-        if updated_fails > allowed_fails or _should_retry == False:
+        if updated_fails > allowed_fails or _should_retry is False:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
-            cached_value = self.cache.get_cache(key=cooldown_key)
+            cached_value = self.cache.get_cache(
+                key=cooldown_key
+            )  # [(deployment_id, {last_error_str, last_error_status_code})]
+
+            cached_value_deployment_ids = []
+            if (
+                cached_value is not None
+                and isinstance(cached_value, list)
+                and len(cached_value) > 0
+                and isinstance(cached_value[0], tuple)
+            ):
+                cached_value_deployment_ids = [cv[0] for cv in cached_value]
 
             verbose_router_logger.debug(f"adding {deployment} to cooldown models")
             # update value
-            try:
-                if deployment in cached_value:
+            if cached_value is not None and len(cached_value_deployment_ids) > 0:
+                if deployment in cached_value_deployment_ids:
                     pass
                 else:
-                    cached_value = cached_value + [deployment]
+                    cached_value = cached_value + [
+                        (
+                            deployment,
+                            {
+                                "Exception Received": str(original_exception),
+                                "Status Code": str(exception_status),
+                            },
+                        )
+                    ]
                     # save updated value
                     self.cache.set_cache(
                         value=cached_value, key=cooldown_key, ttl=cooldown_time
                     )
-            except:
-                cached_value = [deployment]
+            else:
+                cached_value = [
+                    (
+                        deployment,
+                        {
+                            "Exception Received": str(original_exception),
+                            "Status Code": str(exception_status),
+                        },
+                    )
+                ]
                 # save updated value
                 self.cache.set_cache(
                     value=cached_value, key=cooldown_key, ttl=cooldown_time
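After this hunk, the per-minute cooldown cache stores (deployment_id, error-info) tuples rather than bare deployment ids. A minimal illustrative snippet of the new shape and the id extraction the router now performs (values are made up, not from the commit):

    # illustrative cache entry under f"{current_minute}:cooldown_models"
    cached_value = [
        (
            "deployment-id-1",
            {"Exception Received": "this is a mock rate limit error", "Status Code": "429"},
        ),
    ]
    # pull out just the ids, as the guarded list comprehension above does
    cached_value_deployment_ids = [cv[0] for cv in cached_value]
    assert cached_value_deployment_ids == ["deployment-id-1"]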
@@ -3063,7 +3093,33 @@ class Router:
                 key=deployment, value=updated_fails, ttl=cooldown_time
             )
 
-    async def _async_get_cooldown_deployments(self):
+    async def _async_get_cooldown_deployments(self) -> List[str]:
+        """
+        Async implementation of '_get_cooldown_deployments'
+        """
+        dt = get_utc_datetime()
+        current_minute = dt.strftime("%H-%M")
+        # get the current cooldown list for that minute
+        cooldown_key = f"{current_minute}:cooldown_models"
+
+        # ----------------------
+        # Return cooldown models
+        # ----------------------
+        cooldown_models = await self.cache.async_get_cache(key=cooldown_key) or []
+
+        cached_value_deployment_ids = []
+        if (
+            cooldown_models is not None
+            and isinstance(cooldown_models, list)
+            and len(cooldown_models) > 0
+            and isinstance(cooldown_models[0], tuple)
+        ):
+            cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
+
+        verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
+        return cached_value_deployment_ids
+
+    async def _async_get_cooldown_deployments_with_debug_info(self) -> List[tuple]:
         """
         Async implementation of '_get_cooldown_deployments'
         """
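The split matters for callers: _async_get_cooldown_deployments now returns bare deployment ids for routing decisions, while the new _async_get_cooldown_deployments_with_debug_info keeps the (id, error-info) tuples for diagnostics. A usage sketch, assuming `router` is an already-configured Router instance:

    import asyncio

    async def show_cooldowns(router):
        # ids only, e.g. ["deployment-id-1"]
        ids = await router._async_get_cooldown_deployments()
        # (id, error-info) tuples, e.g. [("deployment-id-1", {...})]
        info = await router._async_get_cooldown_deployments_with_debug_info()
        print(ids, info)

    # asyncio.run(show_cooldowns(router))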
@@ -3080,7 +3136,7 @@ class Router:
         verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
         return cooldown_models
 
-    def _get_cooldown_deployments(self):
+    def _get_cooldown_deployments(self) -> List[str]:
         """
         Get the list of models being cooled down for this minute
         """
@@ -3094,8 +3150,17 @@ class Router:
         # ----------------------
         cooldown_models = self.cache.get_cache(key=cooldown_key) or []
 
+        cached_value_deployment_ids = []
+        if (
+            cooldown_models is not None
+            and isinstance(cooldown_models, list)
+            and len(cooldown_models) > 0
+            and isinstance(cooldown_models[0], tuple)
+        ):
+            cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
+
         verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
-        return cooldown_models
+        return cached_value_deployment_ids
 
     def _get_healthy_deployments(self, model: str):
         _all_deployments: list = []
@@ -4713,7 +4778,7 @@ class Router:
             if _allowed_model_region is None:
                 _allowed_model_region = "n/a"
             raise ValueError(
-                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}"
+                f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}, cooldown_list={await self._async_get_cooldown_deployments_with_debug_info()}"
            )
 
        if (
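With this last hunk, a "no deployments available" failure surfaces the cooldown list inline, e.g. cooldown_list=[('deployment-id-1', {'Exception Received': '...', 'Status Code': '429'})] (values illustrative), so operators can see which deployments were benched and why without digging through debug logs.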