feat(proxy_server.py): retry if virtual key is rate limited

currently applied only to the chat completions endpoint
Krrish Dholakia 2024-03-05 19:00:03 -08:00
parent f95458dad8
commit ad55f4dbb5
4 changed files with 57 additions and 1 deletion
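
Taken together, the hunks below make the proxy retry a chat completion internally, with exponential backoff, when the virtual key's parallel-request limiter rejects it, instead of surfacing the 429 to the caller right away. As a rough illustration only (RateLimitedError and call_proxy are stand-ins invented for this sketch, not names from the commit), the pattern being applied looks like this:

import backoff


class RateLimitedError(Exception):
    # Stand-in for litellm's ProxyException in this sketch.
    def __init__(self, message: str):
        super().__init__(message)
        self.message = message


attempts = {"count": 0}


@backoff.on_exception(
    backoff.expo,
    Exception,
    max_tries=3,
    # give up (re-raise immediately) unless this is the rate-limit error
    giveup=lambda e: not (
        isinstance(e, RateLimitedError)
        and "Max parallel request limit reached" in e.message
    ),
)
def call_proxy():
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RateLimitedError("Max parallel request limit reached.")
    return "ok"


print(call_proxy())  # fails twice, backs off, then returns "ok" on the third try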


@@ -71,7 +71,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
     ):
         self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
         api_key = user_api_key_dict.api_key
-        max_parallel_requests = user_api_key_dict.max_parallel_requests or sys.maxsize
+        max_parallel_requests = user_api_key_dict.max_parallel_requests
+        if max_parallel_requests is None:
+            max_parallel_requests = sys.maxsize
         tpm_limit = getattr(user_api_key_dict, "tpm_limit", sys.maxsize)
         if tpm_limit is None:
             tpm_limit = sys.maxsize
@@ -105,6 +107,10 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger):
             and rpm_limit == sys.maxsize
         ):
             pass
+        elif max_parallel_requests == 0 or tpm_limit == 0 or rpm_limit == 0:
+            raise HTTPException(
+                status_code=429, detail="Max parallel request limit reached."
+            )
         elif current is None:
             new_val = {
                 "current_requests": 1,


@@ -8,6 +8,7 @@ import hashlib, uuid
 import warnings
 import importlib
 import warnings
+import backoff
 
 
 def showwarning(message, category, filename, lineno, file=None, line=None):
@@ -2298,6 +2299,11 @@ def parse_cache_control(cache_control):
     return cache_dict
 
 
+def on_backoff(details):
+    # The 'tries' key in the details dictionary contains the number of completed tries
+    verbose_proxy_logger.debug(f"Backing off... this was attempt #{details['tries']}")
+
+
 @router.on_event("startup")
 async def startup_event():
     global prisma_client, master_key, use_background_health_checks, llm_router, llm_model_list, general_settings, proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time, litellm_proxy_admin_name
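
on_backoff here is a standard backoff event handler: the library calls it with a details dict (keys such as 'tries', 'elapsed', and 'wait', plus the target callable and its args) before each sleep. A small self-contained sketch of the same idea, with a hypothetical flaky() function and the standard logging module in place of verbose_proxy_logger:

import logging

import backoff

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("proxy")


def on_backoff(details):
    # 'tries' = completed calls so far, 'wait' = seconds about to be slept,
    # 'elapsed' = seconds since the first call (per the backoff docs)
    logger.debug(
        "Backing off %.1fs after attempt #%d (%.1fs elapsed)",
        details["wait"], details["tries"], details["elapsed"],
    )


calls = {"n": 0}


@backoff.on_exception(backoff.expo, ValueError, max_tries=3, on_backoff=on_backoff)
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise ValueError("not yet")
    return "done"


print(flaky())  # logs two backoff messages, then prints "done"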
@@ -2613,6 +2619,19 @@ async def completion(
     dependencies=[Depends(user_api_key_auth)],
     tags=["chat/completions"],
 )  # azure compatible endpoint
+@backoff.on_exception(
+    backoff.expo,
+    Exception,  # base exception to catch for the backoff
+    max_tries=litellm.num_retries or 3,  # maximum number of retries
+    max_time=litellm.request_timeout or 60,  # maximum total time to retry for
+    on_backoff=on_backoff,  # specifying the function to call on backoff
+    giveup=lambda e: not (
+        isinstance(e, ProxyException)
+        and getattr(e, "message", None) is not None
+        and isinstance(e.message, str)
+        and "Max parallel request limit reached" in e.message
+    ),  # the result of the logical expression is on the second position
+)
 async def chat_completion(
     request: Request,
     fastapi_response: Response,
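
Note the double negative in the giveup predicate: backoff's giveup callback returns True for exceptions that should not be retried, so wrapping the check in not (...) means the handler is retried only when the raised ProxyException carries the "Max parallel request limit reached" message produced by the limiter hook above; any other exception propagates immediately. The retry budget is taken from litellm.num_retries and litellm.request_timeout, falling back to 3 attempts and 60 seconds when those settings are unset.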