Merge branch 'main' into litellm_standardize_slack_exception_msg_format

2025-04-26 03:04:13 +00:00 · 2024-05-20 16:39:41 -07:00 · 2024-05-20 16:39:41 -07:00 · 8413fdf4c7
commit 8413fdf4c7
parent f11de863f6 7fad52b7c8
4 changed files with 81 additions and 30 deletions
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@ -864,27 +864,37 @@ Model Info:

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """Log deployment latency"""
-        if "daily_reports" in self.alert_types:
-            model_id = (
-                kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
-            )
-            response_s: timedelta = end_time - start_time
-
-            final_value = response_s
-            total_tokens = 0
-
-            if isinstance(response_obj, litellm.ModelResponse):
-                completion_tokens = response_obj.usage.completion_tokens
-                final_value = float(response_s.total_seconds() / completion_tokens)
-
-            await self.async_update_daily_reports(
-                DeploymentMetrics(
-                    id=model_id,
-                    failed_request=False,
-                    latency_per_output_token=final_value,
-                    updated_at=litellm.utils.get_utc_datetime(),
+        try:
+            if "daily_reports" in self.alert_types:
+                model_id = (
+                    kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
                )
+                response_s: timedelta = end_time - start_time
+
+                final_value = response_s
+                total_tokens = 0
+
+                if isinstance(response_obj, litellm.ModelResponse):
+                    completion_tokens = response_obj.usage.completion_tokens
+                    if completion_tokens is not None and completion_tokens > 0:
+                        final_value = float(
+                            response_s.total_seconds() / completion_tokens
+                        )
+
+                await self.async_update_daily_reports(
+                    DeploymentMetrics(
+                        id=model_id,
+                        failed_request=False,
+                        latency_per_output_token=final_value,
+                        updated_at=litellm.utils.get_utc_datetime(),
+                    )
+                )
+        except Exception as e:
+            verbose_proxy_logger.error(
+                "[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: ",
+                e,
            )
+            pass

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        """Log failure + deployment latency"""
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -131,7 +131,13 @@ class ProxyLogging:
            alerting_args=alerting_args,
        )

-        if "daily_reports" in self.alert_types:
+        if (
+            self.alerting is not None
+            and "slack" in self.alerting
+            and "daily_reports" in self.alert_types
+        ):
+            # NOTE: ENSURE we only add callbacks when alerting is on
+            # We should NOT add callbacks when alerting is off
            litellm.callbacks.append(self.slack_alerting_instance)  # type: ignore

        if redis_cache is not None:
--- a/litellm/router.py
+++ b/litellm/router.py
@ -1923,10 +1923,28 @@ class Router:
            metadata = kwargs.get("litellm_params", {}).get("metadata", None)
            _model_info = kwargs.get("litellm_params", {}).get("model_info", {})

+            exception_response = getattr(exception, "response", {})
+            exception_headers = getattr(exception_response, "headers", None)
+            _time_to_cooldown = self.cooldown_time
+
+            if exception_headers is not None:
+
+                _time_to_cooldown = (
+                    litellm.utils._get_retry_after_from_exception_header(
+                        response_headers=exception_headers
+                    )
+                )
+
+                if _time_to_cooldown < 0:
+                    # if the response headers did not read it -> set to default cooldown time
+                    _time_to_cooldown = self.cooldown_time
+
            if isinstance(_model_info, dict):
                deployment_id = _model_info.get("id", None)
                self._set_cooldown_deployments(
-                    exception_status=exception_status, deployment=deployment_id
+                    exception_status=exception_status,
+                    deployment=deployment_id,
+                    time_to_cooldown=_time_to_cooldown,
                )  # setting deployment_id in cooldown deployments
            if custom_llm_provider:
                model_name = f"{custom_llm_provider}/{model_name}"
@ -2026,7 +2044,10 @@ class Router:
            return True

    def _set_cooldown_deployments(
-        self, exception_status: Union[str, int], deployment: Optional[str] = None
+        self,
+        exception_status: Union[str, int],
+        deployment: Optional[str] = None,
+        time_to_cooldown: Optional[float] = None,
    ):
        """
        Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
@ -2053,6 +2074,8 @@ class Router:
            f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
        )
        cooldown_time = self.cooldown_time or 1
+        if time_to_cooldown is not None:
+            cooldown_time = time_to_cooldown

        if isinstance(exception_status, str):
            try:
@ -2090,7 +2113,9 @@ class Router:
                )

            self.send_deployment_cooldown_alert(
-                deployment_id=deployment, exception_status=exception_status
+                deployment_id=deployment,
+                exception_status=exception_status,
+                cooldown_time=cooldown_time,
            )
        else:
            self.failed_calls.set_cache(
@ -3751,7 +3776,10 @@ class Router:
        print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n")  # noqa

    def send_deployment_cooldown_alert(
-        self, deployment_id: str, exception_status: Union[str, int]
+        self,
+        deployment_id: str,
+        exception_status: Union[str, int],
+        cooldown_time: float,
    ):
        try:
            from litellm.proxy.proxy_server import proxy_logging_obj
@ -3775,7 +3803,7 @@ class Router:
                )
                asyncio.create_task(
                    proxy_logging_obj.slack_alerting_instance.send_alert(
-                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{self.cooldown_time}` seconds\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
+                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
                        alert_type="cooldown_deployment",
                        level="Low",
                    )
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -8071,11 +8071,8 @@ def _should_retry(status_code: int):
    return False


-def _calculate_retry_after(
-    remaining_retries: int,
-    max_retries: int,
+def _get_retry_after_from_exception_header(
    response_headers: Optional[httpx.Headers] = None,
-    min_timeout: int = 0,
 ):
    """
    Reimplementation of openai's calculate retry after, since that one can't be imported.
@ -8101,10 +8098,20 @@ def _calculate_retry_after(
                    retry_after = int(retry_date - time.time())
        else:
            retry_after = -1
+        return retry_after

-    except Exception:
+    except Exception as e:
        retry_after = -1

+
+def _calculate_retry_after(
+    remaining_retries: int,
+    max_retries: int,
+    response_headers: Optional[httpx.Headers] = None,
+    min_timeout: int = 0,
+):
+    retry_after = _get_retry_after_from_exception_header(response_headers)
+
    # If the API asks us to wait a certain amount of time (and it's a reasonable amount), just do what it says.
    if 0 < retry_after <= 60:
        return retry_after