From 3c4bf5250913bde3686587744a88ac17f21d2de2 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Fri, 17 May 2024 18:50:33 -0700
Subject: [PATCH 1/2] feat - read cooldown time from exception header

---
 litellm/router.py | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/litellm/router.py b/litellm/router.py
index 6400ff64e2..80f1f900c7 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1923,10 +1923,28 @@ class Router:
             metadata = kwargs.get("litellm_params", {}).get("metadata", None)
             _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
 
+            exception_response = getattr(exception, "response", {})
+            exception_headers = getattr(exception_response, "headers", None)
+            _time_to_cooldown = self.cooldown_time
+
+            if exception_headers is not None:
+
+                _time_to_cooldown = (
+                    litellm.utils._get_retry_after_from_exception_header(
+                        response_headers=exception_headers
+                    )
+                )
+
+                if _time_to_cooldown < 0:
+                    # if the response headers did not read it -> set to default cooldown time
+                    _time_to_cooldown = self.cooldown_time
+
             if isinstance(_model_info, dict):
                 deployment_id = _model_info.get("id", None)
                 self._set_cooldown_deployments(
-                    exception_status=exception_status, deployment=deployment_id
+                    exception_status=exception_status,
+                    deployment=deployment_id,
+                    time_to_cooldown=_time_to_cooldown,
                 )  # setting deployment_id in cooldown deployments
             if custom_llm_provider:
                 model_name = f"{custom_llm_provider}/{model_name}"
@@ -2026,7 +2044,10 @@
         return True
 
     def _set_cooldown_deployments(
-        self, exception_status: Union[str, int], deployment: Optional[str] = None
+        self,
+        exception_status: Union[str, int],
+        deployment: Optional[str] = None,
+        time_to_cooldown: Optional[float] = None,
     ):
         """
         Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
@@ -2053,6 +2074,8 @@
                 f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
             )
             cooldown_time = self.cooldown_time or 1
+            if time_to_cooldown is not None:
+                cooldown_time = time_to_cooldown
 
             if isinstance(exception_status, str):
                 try:
@@ -2090,7 +2113,9 @@
                 )
 
                 self.send_deployment_cooldown_alert(
-                    deployment_id=deployment, exception_status=exception_status
+                    deployment_id=deployment,
+                    exception_status=exception_status,
+                    cooldown_time=cooldown_time,
                 )
             else:
                 self.failed_calls.set_cache(
@@ -3751,7 +3776,10 @@
         print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n")  # noqa
 
     def send_deployment_cooldown_alert(
-        self, deployment_id: str, exception_status: Union[str, int]
+        self,
+        deployment_id: str,
+        exception_status: Union[str, int],
+        cooldown_time: float,
     ):
         try:
             from litellm.proxy.proxy_server import proxy_logging_obj
@@ -3775,7 +3803,7 @@
                 )
                 asyncio.create_task(
                     proxy_logging_obj.slack_alerting_instance.send_alert(
-                        message=f"Router: Cooling down Deployment:\nModel Name: {_model_name}\nAPI Base: {_api_base}\n{self.cooldown_time} seconds. Got exception: {str(exception_status)}. Change 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
+                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nGot exception: `{str(exception_status)}`\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                         alert_type="cooldown_deployment",
                         level="Low",
                     )

From cdfa9c92324753e1d25933d3aa0081e2cf4bb435 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Fri, 17 May 2024 18:52:45 -0700
Subject: [PATCH 2/2] fix - cooldown based on exception header

---
 litellm/utils.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index 5d5c2b69c6..5f48d60b80 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -8008,11 +8008,8 @@ def _should_retry(status_code: int):
     return False
 
 
-def _calculate_retry_after(
-    remaining_retries: int,
-    max_retries: int,
+def _get_retry_after_from_exception_header(
     response_headers: Optional[httpx.Headers] = None,
-    min_timeout: int = 0,
 ):
     """
     Reimplementation of openai's calculate retry after, since that one can't be imported.
@@ -8038,10 +8035,20 @@ def _calculate_retry_after(
                     retry_after = int(retry_date - time.time())
         else:
             retry_after = -1
+        return retry_after
 
-    except Exception:
+    except Exception as e:
         retry_after = -1
 
+
+def _calculate_retry_after(
+    remaining_retries: int,
+    max_retries: int,
+    response_headers: Optional[httpx.Headers] = None,
+    min_timeout: int = 0,
+):
+    retry_after = _get_retry_after_from_exception_header(response_headers)
+
     # If the API asks us to wait a certain amount of time (and it's a reasonable amount), just do what it says.
     if 0 < retry_after <= 60:
         return retry_after
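
Note: taken together, the two patches let the router honor a provider's Retry-After hint when cooling down a deployment, instead of always using the fixed cooldown_time. The following is a minimal standalone sketch of that idea, not litellm code; the names retry_after_from_headers, cooldown_seconds, and default_cooldown are illustrative only.

# Sketch only: parse a Retry-After header into a cooldown duration, falling back
# to a default when the header is missing or unparseable.
import email.utils
import time
from typing import Mapping, Optional


def retry_after_from_headers(headers: Optional[Mapping[str, str]]) -> int:
    """Return the Retry-After value in seconds, or -1 if it cannot be read."""
    if headers is None:
        return -1
    retry_header = headers.get("retry-after")
    if retry_header is None:
        return -1
    try:
        # Retry-After may be an integer number of seconds...
        return int(retry_header)
    except ValueError:
        # ...or an HTTP-date; convert it to a delta from now.
        retry_date_tuple = email.utils.parsedate_tz(retry_header)
        if retry_date_tuple is None:
            return -1
        retry_date = email.utils.mktime_tz(retry_date_tuple)
        return int(retry_date - time.time())


def cooldown_seconds(
    headers: Optional[Mapping[str, str]], default_cooldown: float = 1.0
) -> float:
    """Prefer the provider's Retry-After hint; otherwise use the default cooldown."""
    retry_after = retry_after_from_headers(headers)
    return float(retry_after) if retry_after > 0 else default_cooldown


if __name__ == "__main__":
    # 429 response asking the client to wait 30 seconds -> cool down for 30s.
    print(cooldown_seconds({"retry-after": "30"}, default_cooldown=5.0))  # 30.0
    # No headers on the exception -> fall back to the configured cooldown.
    print(cooldown_seconds(None, default_cooldown=5.0))  # 5.0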