diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py
index 227db079d..a5ae97d41 100644
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@@ -864,27 +864,37 @@ Model Info:
 
     async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
         """Log deployment latency"""
-        if "daily_reports" in self.alert_types:
-            model_id = (
-                kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
-            )
-            response_s: timedelta = end_time - start_time
-
-            final_value = response_s
-            total_tokens = 0
-
-            if isinstance(response_obj, litellm.ModelResponse):
-                completion_tokens = response_obj.usage.completion_tokens
-                final_value = float(response_s.total_seconds() / completion_tokens)
-
-            await self.async_update_daily_reports(
-                DeploymentMetrics(
-                    id=model_id,
-                    failed_request=False,
-                    latency_per_output_token=final_value,
-                    updated_at=litellm.utils.get_utc_datetime(),
+        try:
+            if "daily_reports" in self.alert_types:
+                model_id = (
+                    kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
                 )
+                response_s: timedelta = end_time - start_time
+
+                final_value = response_s
+                total_tokens = 0
+
+                if isinstance(response_obj, litellm.ModelResponse):
+                    completion_tokens = response_obj.usage.completion_tokens
+                    if completion_tokens is not None and completion_tokens > 0:
+                        final_value = float(
+                            response_s.total_seconds() / completion_tokens
+                        )
+
+                await self.async_update_daily_reports(
+                    DeploymentMetrics(
+                        id=model_id,
+                        failed_request=False,
+                        latency_per_output_token=final_value,
+                        updated_at=litellm.utils.get_utc_datetime(),
+                    )
+                )
+        except Exception as e:
+            verbose_proxy_logger.error(
+                "[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: ",
+                e,
             )
+            pass
 
     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
         """Log failure + deployment latency"""
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 3dac1563c..6b52eaddc 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -131,7 +131,13 @@ class ProxyLogging:
             alerting_args=alerting_args,
         )
 
-        if "daily_reports" in self.alert_types:
+        if (
+            self.alerting is not None
+            and "slack" in self.alerting
+            and "daily_reports" in self.alert_types
+        ):
+            # NOTE: ENSURE we only add callbacks when alerting is on
+            # We should NOT add callbacks when alerting is off
             litellm.callbacks.append(self.slack_alerting_instance)  # type: ignore
 
         if redis_cache is not None:
diff --git a/litellm/router.py b/litellm/router.py
index a45b9d396..f022a1f14 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -1923,10 +1923,28 @@ class Router:
             metadata = kwargs.get("litellm_params", {}).get("metadata", None)
             _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
 
+            exception_response = getattr(exception, "response", {})
+            exception_headers = getattr(exception_response, "headers", None)
+            _time_to_cooldown = self.cooldown_time
+
+            if exception_headers is not None:
+
+                _time_to_cooldown = (
+                    litellm.utils._get_retry_after_from_exception_header(
+                        response_headers=exception_headers
+                    )
+                )
+
+                if _time_to_cooldown < 0:
+                    # if the response headers did not read it -> set to default cooldown time
+                    _time_to_cooldown = self.cooldown_time
+
             if isinstance(_model_info, dict):
                 deployment_id = _model_info.get("id", None)
                 self._set_cooldown_deployments(
-                    exception_status=exception_status, deployment=deployment_id
+                    exception_status=exception_status,
+                    deployment=deployment_id,
+                    time_to_cooldown=_time_to_cooldown,
                 )  # setting deployment_id in cooldown deployments
             if custom_llm_provider:
                 model_name = f"{custom_llm_provider}/{model_name}"
@@ -2026,7 +2044,10 @@ class Router:
         return True
 
     def _set_cooldown_deployments(
-        self, exception_status: Union[str, int], deployment: Optional[str] = None
+        self,
+        exception_status: Union[str, int],
+        deployment: Optional[str] = None,
+        time_to_cooldown: Optional[float] = None,
     ):
         """
         Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
@@ -2053,6 +2074,8 @@
             f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
         )
         cooldown_time = self.cooldown_time or 1
+        if time_to_cooldown is not None:
+            cooldown_time = time_to_cooldown
 
         if isinstance(exception_status, str):
             try:
@@ -2090,7 +2113,9 @@
                 )
 
             self.send_deployment_cooldown_alert(
-                deployment_id=deployment, exception_status=exception_status
+                deployment_id=deployment,
+                exception_status=exception_status,
+                cooldown_time=cooldown_time,
             )
         else:
             self.failed_calls.set_cache(
@@ -3751,7 +3776,10 @@ class Router:
         print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n")  # noqa
 
     def send_deployment_cooldown_alert(
-        self, deployment_id: str, exception_status: Union[str, int]
+        self,
+        deployment_id: str,
+        exception_status: Union[str, int],
+        cooldown_time: float,
     ):
         try:
             from litellm.proxy.proxy_server import proxy_logging_obj
@@ -3775,7 +3803,7 @@
                 )
                 asyncio.create_task(
                     proxy_logging_obj.slack_alerting_instance.send_alert(
-                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{self.cooldown_time}` seconds\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
+                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                         alert_type="cooldown_deployment",
                         level="Low",
                     )
diff --git a/litellm/utils.py b/litellm/utils.py
index c12741a97..a265dbce4 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -8071,11 +8071,8 @@ def _should_retry(status_code: int):
     return False
 
 
-def _calculate_retry_after(
-    remaining_retries: int,
-    max_retries: int,
+def _get_retry_after_from_exception_header(
     response_headers: Optional[httpx.Headers] = None,
-    min_timeout: int = 0,
 ):
     """
     Reimplementation of openai's calculate retry after, since that one can't be imported.
@@ -8101,10 +8098,20 @@
                     retry_after = int(retry_date - time.time())
         else:
             retry_after = -1
+        return retry_after
 
-    except Exception:
+    except Exception as e:
         retry_after = -1
 
+
+def _calculate_retry_after(
+    remaining_retries: int,
+    max_retries: int,
+    response_headers: Optional[httpx.Headers] = None,
+    min_timeout: int = 0,
+):
+    retry_after = _get_retry_after_from_exception_header(response_headers)
+
     # If the API asks us to wait a certain amount of time (and it's a reasonable amount), just do what it says.
     if 0 < retry_after <= 60:
         return retry_after
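# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the patch): how the new
# litellm.utils._get_retry_after_from_exception_header helper can feed the
# router cooldown, assuming this diff is applied. The 1-second default below
# is a hypothetical stand-in for Router.cooldown_time.
# ---------------------------------------------------------------------------
import httpx
import litellm

default_cooldown_time = 1.0  # hypothetical stand-in for Router.cooldown_time

# A provider 429 response advertising when to retry.
headers = httpx.Headers({"retry-after": "7"})

retry_after = litellm.utils._get_retry_after_from_exception_header(
    response_headers=headers
)

# A negative (or missing) value means the header could not be read -> fall back
# to the default, mirroring the check added in deployment_callback_on_failure.
time_to_cooldown = (
    retry_after if retry_after and retry_after > 0 else default_cooldown_time
)
print(time_to_cooldown)  # 7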