diff --git a/docs/my-website/docs/proxy/debugging.md b/docs/my-website/docs/proxy/debugging.md
index 571a97c0e..38680982a 100644
--- a/docs/my-website/docs/proxy/debugging.md
+++ b/docs/my-website/docs/proxy/debugging.md
@@ -88,4 +88,31 @@ Expected Output:
```bash
# no info statements
-```
\ No newline at end of file
+```
+
+## Common Errors
+
+1. "No available deployments..."
+
+```
+No deployments available for selected model, Try again in 60 seconds. Passed model=claude-3-5-sonnet. pre-call-checks=False, allowed_model_region=n/a.
+```
+
+This usually happens when all of your deployments hit rate limit errors, which triggers the cooldown.
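+
+One way to reproduce this locally is to make a deployment always raise a mock rate limit error via `mock_response` (the model name below is just an example):
+
+```yaml
+model_list:
+  - model_name: claude-3-5-sonnet
+    litellm_params:
+      model: "openai/*"
+      mock_response: "litellm.RateLimitError" # 👈 every request raises a mock rate limit error
+```
+
+Once enough requests fail, the deployment is cooled down and calls to that model return the "No deployments available..." error above.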
+
+How to control this?
+- Adjust the cooldown time
+
+```yaml
+router_settings:
+ cooldown_time: 0 # 👈 KEY CHANGE
+```
+
+- Disable Cooldowns [NOT RECOMMENDED]
+
+```yaml
+router_settings:
+ disable_cooldowns: True
+```
+
+This is not recommended, as it will cause requests to be routed to deployments that are already over their TPM/RPM limits.
\ No newline at end of file
diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 240e6c8e0..905954e97 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -815,6 +815,35 @@ model_list:
+**Expected Response**
+
+```
+No deployments available for selected model, Try again in 60 seconds. Passed model=claude-3-5-sonnet. pre-call-checks=False, allowed_model_region=n/a.
+```
+
+#### **Disable cooldowns**
+
+If you're using the SDK, pass `disable_cooldowns` to the Router:
+
+```python
+from litellm import Router
+
+router = Router(..., disable_cooldowns=True)
+```
+
+If you're using the proxy, set this in your `config.yaml`:
+
+```yaml
+router_settings:
+ disable_cooldowns: True
+```
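+
+To sanity-check the behavior, here's a minimal sketch (the model name and key are placeholders; `mock_response` is a testing hook that forces a rate limit error) confirming that nothing gets put into cooldown even after repeated failures:
+
+```python
+from litellm import Router
+
+# a minimal sketch - the deployment below always raises a mock rate limit error
+router = Router(
+    model_list=[
+        {
+            "model_name": "claude-3-5-sonnet",
+            "litellm_params": {
+                "model": "openai/my-fake-model",
+                "api_key": "my-fake-key",
+                "mock_response": "litellm.RateLimitError",
+            },
+        }
+    ],
+    disable_cooldowns=True,
+    num_retries=0,
+)
+
+for _ in range(3):
+    try:
+        router.completion(
+            model="claude-3-5-sonnet",
+            messages=[{"role": "user", "content": "hey"}],
+        )
+    except Exception as e:
+        print(f"got expected error: {e}")
+
+# internal helper - with cooldowns disabled this stays empty, even after repeated failures
+print(router._get_cooldown_deployments())  # []
+```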
+
### Retries
For both async + sync functions, we support retrying failed requests.
diff --git a/litellm/main.py b/litellm/main.py
index 48d430d52..c48c242ce 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -476,6 +476,15 @@ def mock_completion(
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
+ elif (
+ isinstance(mock_response, str) and mock_response == "litellm.RateLimitError"
+ ):
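+        # let callers/tests simulate rate limit errors by passing mock_response="litellm.RateLimitError"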
+ raise litellm.RateLimitError(
+ message="this is a mock rate limit error",
+ status_code=getattr(mock_response, "status_code", 429), # type: ignore
+ llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"), # type: ignore
+ model=model,
+ )
time_delay = kwargs.get("mock_delay", None)
if time_delay is not None:
time.sleep(time_delay)
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index f21a9e832..c62fc9944 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,14 +1,5 @@
model_list:
- - model_name: "*" # all requests where model not in your config go to this deployment
+ - model_name: claude-3-5-sonnet # all requests where model not in your config go to this deployment
litellm_params:
- model: "openai/*"
-
-litellm_settings:
- success_callback: ["s3"]
- s3_callback_params:
- s3_bucket_name: my-test-bucket-22-litellm # AWS Bucket Name for S3
- s3_region_name: us-west-2 # AWS Region Name for S3
- s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3
- s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
- s3_path: my-test-path
-
+ model: "openai/*"
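+      # forces every request to this deployment to raise a mock RateLimitError (exercises router cooldown handling)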
+ mock_response: "litellm.RateLimitError"
\ No newline at end of file
diff --git a/litellm/router.py b/litellm/router.py
index ba3f13b8e..0d082de5d 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -156,6 +156,7 @@ class Router:
cooldown_time: Optional[
float
] = None, # (seconds) time to cooldown a deployment after failure
+ disable_cooldowns: Optional[bool] = None,
routing_strategy: Literal[
"simple-shuffle",
"least-busy",
@@ -307,6 +308,7 @@ class Router:
self.allowed_fails = allowed_fails or litellm.allowed_fails
self.cooldown_time = cooldown_time or 60
+ self.disable_cooldowns = disable_cooldowns
self.failed_calls = (
InMemoryCache()
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
@@ -2990,6 +2992,8 @@ class Router:
the exception is not one that should be immediately retried (e.g. 401)
"""
+ if self.disable_cooldowns is True:
+ return
if deployment is None:
return
@@ -3030,24 +3034,50 @@ class Router:
exception_status = 500
_should_retry = litellm._should_retry(status_code=exception_status)
- if updated_fails > allowed_fails or _should_retry == False:
+ if updated_fails > allowed_fails or _should_retry is False:
# get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
- cached_value = self.cache.get_cache(key=cooldown_key)
+ cached_value = self.cache.get_cache(
+ key=cooldown_key
+ ) # [(deployment_id, {last_error_str, last_error_status_code})]
+ cached_value_deployment_ids = []
+ if (
+ cached_value is not None
+ and isinstance(cached_value, list)
+ and len(cached_value) > 0
+ and isinstance(cached_value[0], tuple)
+ ):
+ cached_value_deployment_ids = [cv[0] for cv in cached_value]
verbose_router_logger.debug(f"adding {deployment} to cooldown models")
# update value
- try:
- if deployment in cached_value:
+ if cached_value is not None and len(cached_value_deployment_ids) > 0:
+ if deployment in cached_value_deployment_ids:
pass
else:
- cached_value = cached_value + [deployment]
+ cached_value = cached_value + [
+ (
+ deployment,
+ {
+ "Exception Received": str(original_exception),
+ "Status Code": str(exception_status),
+ },
+ )
+ ]
# save updated value
self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
)
- except:
- cached_value = [deployment]
+ else:
+ cached_value = [
+ (
+ deployment,
+ {
+ "Exception Received": str(original_exception),
+ "Status Code": str(exception_status),
+ },
+ )
+ ]
# save updated value
self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time
@@ -3063,7 +3093,33 @@ class Router:
key=deployment, value=updated_fails, ttl=cooldown_time
)
- async def _async_get_cooldown_deployments(self):
+ async def _async_get_cooldown_deployments(self) -> List[str]:
+ """
+ Async implementation of '_get_cooldown_deployments'
+ """
+ dt = get_utc_datetime()
+ current_minute = dt.strftime("%H-%M")
+ # get the current cooldown list for that minute
+ cooldown_key = f"{current_minute}:cooldown_models"
+
+ # ----------------------
+ # Return cooldown models
+ # ----------------------
+ cooldown_models = await self.cache.async_get_cache(key=cooldown_key) or []
+
+ cached_value_deployment_ids = []
+ if (
+ cooldown_models is not None
+ and isinstance(cooldown_models, list)
+ and len(cooldown_models) > 0
+ and isinstance(cooldown_models[0], tuple)
+ ):
+ cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
+
+ verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
+ return cached_value_deployment_ids
+
+ async def _async_get_cooldown_deployments_with_debug_info(self) -> List[tuple]:
"""
Async implementation of '_get_cooldown_deployments'
"""
@@ -3080,7 +3136,7 @@ class Router:
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
return cooldown_models
- def _get_cooldown_deployments(self):
+ def _get_cooldown_deployments(self) -> List[str]:
"""
Get the list of models being cooled down for this minute
"""
@@ -3094,8 +3150,17 @@ class Router:
# ----------------------
cooldown_models = self.cache.get_cache(key=cooldown_key) or []
+ cached_value_deployment_ids = []
+ if (
+ cooldown_models is not None
+ and isinstance(cooldown_models, list)
+ and len(cooldown_models) > 0
+ and isinstance(cooldown_models[0], tuple)
+ ):
+ cached_value_deployment_ids = [cv[0] for cv in cooldown_models]
+
verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
- return cooldown_models
+ return cached_value_deployment_ids
def _get_healthy_deployments(self, model: str):
_all_deployments: list = []
@@ -4713,7 +4778,7 @@ class Router:
if _allowed_model_region is None:
_allowed_model_region = "n/a"
raise ValueError(
- f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}"
+ f"{RouterErrors.no_deployments_available.value}, Try again in {self.cooldown_time} seconds. Passed model={model}. pre-call-checks={self.enable_pre_call_checks}, allowed_model_region={_allowed_model_region}, cooldown_list={await self._async_get_cooldown_deployments_with_debug_info()}"
)
if (