feat(slack_alerting.py): refactor region outage alerting to do model-based alerting instead
We can't reliably extract the Azure region from the api base, so it makes sense to start with model-based alerting and move to region-based alerting later.
parent 2cdb0584d1
commit 8dec87425e
7 changed files with 119 additions and 51 deletions
@@ -23,9 +23,8 @@ import litellm.types.router


 class OutageModel(TypedDict):
-    provider: str
-    region_name: str
-    alerts: List[str]
+    model_id: str
+    alerts: List[int]
+    deployment_ids: List[str]
     minor_alert_sent: bool
     major_alert_sent: bool
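For orientation, a minimal standalone sketch (not part of the diff) of what a cached entry looks like under the new schema: keyed by model_id and accumulating HTTP status codes instead of provider/region strings. The id value is made up for illustration.

from typing import List, TypedDict


class OutageModel(TypedDict):
    model_id: str
    alerts: List[int]
    deployment_ids: List[str]
    minor_alert_sent: bool
    major_alert_sent: bool


# hypothetical cache entry, keyed by the deployment's model_id
outage_value: OutageModel = {
    "model_id": "example-deployment-id",          # made-up id for illustration
    "alerts": [408, 500],                          # status codes, no longer message strings
    "deployment_ids": ["example-deployment-id"],
    "minor_alert_sent": False,
    "major_alert_sent": False,
}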
@@ -146,6 +145,7 @@ class SlackAlerting(CustomLogger):
         self.is_running = False
         self.alerting_args = SlackAlertingArgs(**alerting_args)
         self.default_webhook_url = default_webhook_url
+        self.llm_router: Optional[litellm.Router] = None

     def update_values(
         self,
@@ -154,6 +154,7 @@ class SlackAlerting(CustomLogger):
         alert_types: Optional[List] = None,
         alert_to_webhook_url: Optional[Dict] = None,
         alerting_args: Optional[Dict] = None,
+        llm_router: Optional[litellm.Router] = None,
     ):
         if alerting is not None:
             self.alerting = alerting
@@ -169,6 +170,8 @@ class SlackAlerting(CustomLogger):
                 self.alert_to_webhook_url = alert_to_webhook_url
             else:
                 self.alert_to_webhook_url.update(alert_to_webhook_url)
+        if llm_router is not None:
+            self.llm_router = llm_router

     async def deployment_in_cooldown(self):
         pass
@@ -718,21 +721,42 @@ class SlackAlerting(CustomLogger):
                 return
         return

+    def _count_outage_alerts(self, alerts: List[int]) -> str:
+        """
+        Parameters:
+        - alerts: List[int] -> list of error codes (either 408 or 500+)
+
+        Returns:
+        - str -> formatted string. This is an alert message, giving a human-friendly description of the errors.
+        """
+        error_breakdown = {"Timeout Errors": 0, "API Errors": 0, "Unknown Errors": 0}
+        for alert in alerts:
+            if alert == 408:
+                error_breakdown["Timeout Errors"] += 1
+            elif alert >= 500:
+                error_breakdown["API Errors"] += 1
+            else:
+                error_breakdown["Unknown Errors"] += 1
+
+        error_msg = ""
+        for key, value in error_breakdown.items():
+            if value > 0:
+                error_msg += "\n{}: {}\n".format(key, value)
+
+        return error_msg
+
     async def outage_alerts(
         self,
-        provider: str,
-        region_name: str,
         exception: APIError,
         deployment_id: str,
     ) -> None:
         """
-        Send slack alert if provider region (e.g. azure east-us-1) is having an outage (408 or >500 errors).
+        Send slack alert if model is badly configured / having an outage (408, 401, 429, >=500).

-        key = (provider + region)
+        key = model_id

         value = {
-        - provider
-        - region
+        - model_id
         - threshold
         - alerts []
         }
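A quick usage note on the new helper: it collapses the raw status-code list into the human-readable breakdown used in the alert body. The logic below is a standalone copy of the method added above, lifted out of the class so the snippet runs on its own.

def _count_outage_alerts(alerts):
    # verbatim logic of the helper added above, outside the class for illustration
    error_breakdown = {"Timeout Errors": 0, "API Errors": 0, "Unknown Errors": 0}
    for alert in alerts:
        if alert == 408:
            error_breakdown["Timeout Errors"] += 1
        elif alert >= 500:
            error_breakdown["API Errors"] += 1
        else:
            error_breakdown["Unknown Errors"] += 1

    error_msg = ""
    for key, value in error_breakdown.items():
        if value > 0:
            error_msg += "\n{}: {}\n".format(key, value)
    return error_msg


print(_count_outage_alerts([408, 500, 502, 429]))
# Timeout Errors: 1
# API Errors: 2
# Unknown Errors: 1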
@@ -741,23 +765,37 @@ class SlackAlerting(CustomLogger):
        max_alerts_size = 10
        """
        try:
-            _id = provider + region_name
-
-            outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id)  # type: ignore
+            outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=deployment_id)  # type: ignore
             if (
-                getattr(exception, "status_code", None) is not None
-                and exception.status_code != 408  # type: ignore
-                and exception.status_code < 500  # type: ignore
+                getattr(exception, "status_code", None) is None
+                or (
+                    exception.status_code != 408  # type: ignore
+                    and exception.status_code < 500  # type: ignore
+                )
+                or self.llm_router is None
             ):
                 return

+            ### EXTRACT MODEL DETAILS ###
+            deployment = self.llm_router.get_deployment(model_id=deployment_id)
+            if deployment is None:
+                return
+
+            model = deployment.litellm_params.model
+            provider = deployment.litellm_params.custom_llm_provider
+            if provider is None:
+                try:
+                    model, provider, _, _ = litellm.get_llm_provider(model=model)
+                except Exception as e:
+                    provider = ""
+            api_base = litellm.get_api_base(
+                model=model, optional_params=deployment.litellm_params
+            )
+
             if outage_value is None:
                 outage_value = OutageModel(
-                    provider=provider,
-                    region_name=region_name,
-                    alerts=[exception.message],
+                    model_id=deployment_id,
+                    alerts=[exception.status_code],  # type: ignore
+                    deployment_ids=[deployment_id],
                     minor_alert_sent=False,
                     major_alert_sent=False,
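The guard is inverted relative to the old code, which is easy to misread. A small standalone restatement of when an error is now recorded (the function name is hypothetical, only for illustration):

from typing import Optional


def should_record_outage(status_code: Optional[int], has_router: bool) -> bool:
    # mirrors the refactored early-return: only 408s and 5xx errors are tracked,
    # and a router handle is now required to resolve the deployment's details
    if status_code is None:
        return False
    if status_code != 408 and status_code < 500:
        return False
    if not has_router:
        return False
    return True


assert should_record_outage(408, True) is True
assert should_record_outage(503, True) is True
assert should_record_outage(429, True) is False   # not tracked by the guard above
assert should_record_outage(500, False) is False  # no router -> no deployment lookup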
@@ -766,25 +804,35 @@ class SlackAlerting(CustomLogger):

                 ## add to cache ##
                 await self.internal_usage_cache.async_set_cache(
-                    key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl
+                    key=deployment_id,
+                    value=outage_value,
+                    ttl=self.alerting_args.outage_alert_ttl,
                 )
                 return

-            outage_value["alerts"].append(exception.message)
+            outage_value["alerts"].append(exception.status_code)  # type: ignore
+            outage_value["deployment_ids"].append(deployment_id)
             outage_value["last_updated_at"] = time.time()

             ## MINOR OUTAGE ALERT SENT ##
             if (
                 outage_value["minor_alert_sent"] == False
                 and len(outage_value["alerts"])
                 >= self.alerting_args.minor_outage_alert_threshold
             ):
-                msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
-                    provider,
-                    region_name,
-                    outage_value["alerts"],
-                    outage_value["last_updated_at"],
-                )
+                msg = f"""\n\n
+*⚠️ Minor Service Outage*
+
+*Model Name:* `{model}`
+*Provider:* `{provider}`
+*API Base:* `{api_base}`
+
+*Errors:*
+{self._count_outage_alerts(alerts=outage_value["alerts"])}
+
+
+*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
+"""
                 # send minor alert
                 _result_val = self.send_alert(
                     message=msg, level="Medium", alert_type="outage_alerts"
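For a concrete sense of the new Slack formatting, a rough rendering of the minor-outage message with made-up values (model, provider, api base, error counts, and timing are illustrative only):

example_minor_alert = """
*⚠️ Minor Service Outage*

*Model Name:* `azure/chatgpt-v-2`
*Provider:* `azure`
*API Base:* `https://my-endpoint.openai.azure.com`

*Errors:*
Timeout Errors: 1
API Errors: 2

*Last Check:* `12.3456s ago`
"""
print(example_minor_alert)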
@@ -798,12 +846,19 @@ class SlackAlerting(CustomLogger):
                 and len(outage_value["alerts"])
                 >= self.alerting_args.major_outage_alert_threshold
             ):
-                msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
-                    provider,
-                    region_name,
-                    outage_value["alerts"],
-                    outage_value["last_updated_at"],
-                )
+                msg = f"""\n\n
+*⚠️ Major Service Outage*
+
+*Model Name:* `{model}`
+*Provider:* `{provider}`
+*API Base:* `{api_base}`
+
+*Errors:*
+{self._count_outage_alerts(alerts=outage_value["alerts"])}
+
+
+*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
+"""
                 # send minor alert
                 await self.send_alert(
                     message=msg, level="High", alert_type="outage_alerts"
@@ -812,7 +867,9 @@ class SlackAlerting(CustomLogger):
                outage_value["major_alert_sent"] = True

            ## update cache ##
-            await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
+            await self.internal_usage_cache.async_set_cache(
+                key=deployment_id, value=outage_value
+            )
        except Exception as e:
            pass
@@ -1075,8 +1132,6 @@ Model Info:
                    _region_name = ""

                await self.outage_alerts(
-                    provider=kwargs.get("custom_llm_provider", "") or "",
-                    region_name=_region_name,
                    exception=kwargs["exception"],
                    deployment_id=model_id,
                )

@@ -868,6 +868,7 @@ def completion(
            user=user,
            optional_params=optional_params,
            litellm_params=litellm_params,
+            custom_llm_provider=custom_llm_provider,
        )
        if mock_response:
            return mock_completion(

@@ -24,7 +24,7 @@ litellm_settings:

 general_settings:
   alerting: ["slack"]
-  alerting_args:
-    report_check_interval: 10
-  enable_jwt_auth: True
+  # alerting_args:
+  #   report_check_interval: 10
+  # enable_jwt_auth: True

@@ -3007,7 +3007,7 @@ class ProxyConfig:
                general_settings["alert_types"] = _general_settings["alert_types"]
                proxy_logging_obj.alert_types = general_settings["alert_types"]
                proxy_logging_obj.slack_alerting_instance.update_values(
-                    alert_types=general_settings["alert_types"]
+                    alert_types=general_settings["alert_types"], llm_router=llm_router
                )

            if "alert_to_webhook_url" in _general_settings:
@@ -3015,7 +3015,8 @@ class ProxyConfig:
                    "alert_to_webhook_url"
                ]
                proxy_logging_obj.slack_alerting_instance.update_values(
-                    alert_to_webhook_url=general_settings["alert_to_webhook_url"]
+                    alert_to_webhook_url=general_settings["alert_to_webhook_url"],
+                    llm_router=llm_router,
                )

    async def _update_general_settings(self, db_general_settings: Optional[Json]):
@@ -3583,6 +3584,9 @@ async def startup_event():
    ## Error Tracking ##
    error_tracking()

+    ## UPDATE SLACK ALERTING ##
+    proxy_logging_obj.slack_alerting_instance.update_values(llm_router=llm_router)
+
    db_writer_client = HTTPHandler()

    proxy_logging_obj._init_litellm_callbacks()  # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made

@@ -3876,13 +3876,13 @@ class Router:
                _api_base = litellm.get_api_base(
                    model=_model_name, optional_params=temp_litellm_params
                )
-                asyncio.create_task(
-                    proxy_logging_obj.slack_alerting_instance.send_alert(
-                        message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
-                        alert_type="cooldown_deployment",
-                        level="Low",
-                    )
-                )
+                # asyncio.create_task(
+                #     proxy_logging_obj.slack_alerting_instance.send_alert(
+                #         message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
-                #         alert_type="cooldown_deployment",
+                #         level="Low",
+                #     )
+                # )
            except Exception as e:
                pass

@@ -572,6 +572,8 @@ async def test_outage_alerting_called(
        num_retries=0,
        allowed_fails=100,
    )

+    slack_alerting.update_values(llm_router=router)
+
    with patch.object(
        slack_alerting, "outage_alerts", new=AsyncMock()
    ) as mock_send_alert:

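The new `slack_alerting.update_values(llm_router=router)` line is what makes the test meaningful: without a router handle, the refactored `outage_alerts()` hits the early return and records nothing. A minimal wiring sketch, assuming a default-constructible SlackAlerting and the usual import path (both assumptions, not confirmed by this diff):

import litellm
from litellm.integrations.slack_alerting import SlackAlerting  # assumed import path

router = litellm.Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",                 # made-up deployment
            "litellm_params": {"model": "gpt-3.5-turbo"},
        }
    ],
    num_retries=0,
    allowed_fails=100,
)

slack_alerting = SlackAlerting()                            # assuming defaults suffice
slack_alerting.update_values(llm_router=router)             # required before outage_alerts() can resolve deployments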
@@ -6286,7 +6286,9 @@ def get_model_region(
    return None


-def get_api_base(model: str, optional_params: dict) -> Optional[str]:
+def get_api_base(
+    model: str, optional_params: Union[dict, LiteLLM_Params]
+) -> Optional[str]:
    """
    Returns the api base used for calling the model.

@@ -6306,7 +6308,9 @@ def get_api_base(model: str, optional_params: dict) -> Optional[str]:
    """

    try:
-        if "model" in optional_params:
+        if isinstance(optional_params, LiteLLM_Params):
+            _optional_params = optional_params
+        elif "model" in optional_params:
            _optional_params = LiteLLM_Params(**optional_params)
        else:  # prevent needing to copy and pop the dict
            _optional_params = LiteLLM_Params(
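A usage sketch of the widened signature: both call styles should now resolve the same api base. The import path for LiteLLM_Params is assumed from the `import litellm.types.router` seen at the top of this diff, and the values are illustrative.

import litellm
from litellm.types.router import LiteLLM_Params  # assumed location of LiteLLM_Params

# plain-dict call, as before
api_base = litellm.get_api_base(
    model="azure/chatgpt-v-2",
    optional_params={"api_base": "https://my-endpoint.openai.azure.com"},
)

# LiteLLM_Params call, as used by the router and by the outage alerting above
params = LiteLLM_Params(
    model="azure/chatgpt-v-2",
    api_base="https://my-endpoint.openai.azure.com",
)
api_base = litellm.get_api_base(model="azure/chatgpt-v-2", optional_params=params)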
@@ -6699,6 +6703,8 @@ def get_llm_provider(
    Returns the provider for a given model name - e.g. 'azure/chatgpt-v-2' -> 'azure'

+    For router -> Can also give the whole litellm param dict -> this function will extract the relevant details
+
    Raises Error - if unable to map model to a provider
    """
    try:
        ## IF LITELLM PARAMS GIVEN ##
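And a usage sketch of the documented behavior: `get_llm_provider` returns a 4-tuple, and its provider element is what the outage alerting falls back to when the deployment does not carry `custom_llm_provider`. The names for the last two tuple elements are given here only for illustration.

import litellm

model, provider, dynamic_api_key, api_base = litellm.get_llm_provider(
    model="azure/chatgpt-v-2"
)
print(provider)  # -> "azure", per the docstring example above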