diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py
index 2a2ec4ab7..1a8eef24d 100644
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@@ -23,9 +23,8 @@ import litellm.types.router
 
 
 class OutageModel(TypedDict):
-    provider: str
-    region_name: str
-    alerts: List[str]
+    model_id: str
+    alerts: List[int]
     deployment_ids: List[str]
     minor_alert_sent: bool
     major_alert_sent: bool
@@ -146,6 +145,7 @@ class SlackAlerting(CustomLogger):
         self.is_running = False
         self.alerting_args = SlackAlertingArgs(**alerting_args)
         self.default_webhook_url = default_webhook_url
+        self.llm_router: Optional[litellm.Router] = None
 
     def update_values(
         self,
@@ -154,6 +154,7 @@ class SlackAlerting(CustomLogger):
         alert_types: Optional[List] = None,
         alert_to_webhook_url: Optional[Dict] = None,
         alerting_args: Optional[Dict] = None,
+        llm_router: Optional[litellm.Router] = None,
     ):
         if alerting is not None:
             self.alerting = alerting
@@ -169,6 +170,8 @@ class SlackAlerting(CustomLogger):
                 self.alert_to_webhook_url = alert_to_webhook_url
             else:
                 self.alert_to_webhook_url.update(alert_to_webhook_url)
+        if llm_router is not None:
+            self.llm_router = llm_router
 
     async def deployment_in_cooldown(self):
         pass
@@ -718,21 +721,42 @@ class SlackAlerting(CustomLogger):
                 return
             return
 
+    def _count_outage_alerts(self, alerts: List[int]) -> str:
+        """
+        Parameters:
+        - alerts: List[int] -> list of error codes (either 408 or 500+)
+
+        Returns:
+        - str -> formatted string. This is an alert message, giving a human-friendly description of the errors.
+        """
+        error_breakdown = {"Timeout Errors": 0, "API Errors": 0, "Unknown Errors": 0}
+        for alert in alerts:
+            if alert == 408:
+                error_breakdown["Timeout Errors"] += 1
+            elif alert >= 500:
+                error_breakdown["API Errors"] += 1
+            else:
+                error_breakdown["Unknown Errors"] += 1
+
+        error_msg = ""
+        for key, value in error_breakdown.items():
+            if value > 0:
+                error_msg += "\n{}: {}\n".format(key, value)
+
+        return error_msg
+
     async def outage_alerts(
         self,
-        provider: str,
-        region_name: str,
         exception: APIError,
         deployment_id: str,
     ) -> None:
         """
-        Send slack alert if provider region (e.g. azure east-us-1) is having an outage (408 or >500 errors).
+        Send slack alert if model is badly configured / having an outage (408 or >=500 errors).
 
-        key = (provider + region)
+        key = model_id
 
         value = {
-        - provider
-        - region
+        - model_id
         - threshold
         - alerts []
         }
@@ -741,23 +765,37 @@ class SlackAlerting(CustomLogger):
            max_alerts_size = 10
 
         """
         try:
-            _id = provider + region_name
-
-            outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id)  # type: ignore
-
+            outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=deployment_id)  # type: ignore
             if (
-                getattr(exception, "status_code", None) is not None
-                and exception.status_code != 408  # type: ignore
-                and exception.status_code < 500  # type: ignore
+                getattr(exception, "status_code", None) is None
+                or (
+                    exception.status_code != 408  # type: ignore
+                    and exception.status_code < 500  # type: ignore
+                )
+                or self.llm_router is None
             ):
                 return
 
+            ### EXTRACT MODEL DETAILS ###
+            deployment = self.llm_router.get_deployment(model_id=deployment_id)
+            if deployment is None:
+                return
+
+            model = deployment.litellm_params.model
+            provider = deployment.litellm_params.custom_llm_provider
+            if provider is None:
+                try:
+                    model, provider, _, _ = litellm.get_llm_provider(model=model)
+                except Exception as e:
+                    provider = ""
+            api_base = litellm.get_api_base(
+                model=model, optional_params=deployment.litellm_params
+            )
             if outage_value is None:
                 outage_value = OutageModel(
-                    provider=provider,
-                    region_name=region_name,
-                    alerts=[exception.message],
+                    model_id=deployment_id,
+                    alerts=[exception.status_code],  # type: ignore
                     deployment_ids=[deployment_id],
                     minor_alert_sent=False,
                     major_alert_sent=False,
@@ -766,25 +804,35 @@ class SlackAlerting(CustomLogger):
 
                 ## add to cache ##
                 await self.internal_usage_cache.async_set_cache(
-                    key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl
+                    key=deployment_id,
+                    value=outage_value,
+                    ttl=self.alerting_args.outage_alert_ttl,
                 )
                 return
 
-            outage_value["alerts"].append(exception.message)
+            outage_value["alerts"].append(exception.status_code)  # type: ignore
             outage_value["deployment_ids"].append(deployment_id)
             outage_value["last_updated_at"] = time.time()
 
+            ## MINOR OUTAGE ALERT SENT ##
             if (
                 outage_value["minor_alert_sent"] == False
                 and len(outage_value["alerts"])
                 >= self.alerting_args.minor_outage_alert_threshold
             ):
-                msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
-                    provider,
-                    region_name,
-                    outage_value["alerts"],
-                    outage_value["last_updated_at"],
-                )
+                msg = f"""\n\n
+*⚠️ Minor Service Outage*
+
+*Model Name:* `{model}`
+*Provider:* `{provider}`
+*API Base:* `{api_base}`
+
+*Errors:*
+{self._count_outage_alerts(alerts=outage_value["alerts"])}
+
+
+*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
+"""
                 # send minor alert
                 _result_val = self.send_alert(
                     message=msg, level="Medium", alert_type="outage_alerts"
@@ -798,12 +846,19 @@ class SlackAlerting(CustomLogger):
                 and len(outage_value["alerts"])
                 >= self.alerting_args.major_outage_alert_threshold
             ):
-                msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
-                    provider,
-                    region_name,
-                    outage_value["alerts"],
-                    outage_value["last_updated_at"],
-                )
+                msg = f"""\n\n
+*⚠️ Major Service Outage*
+
+*Model Name:* `{model}`
+*Provider:* `{provider}`
+*API Base:* `{api_base}`
+
+*Errors:*
+{self._count_outage_alerts(alerts=outage_value["alerts"])}
+
+
+*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
+"""
                 # send minor alert
                 await self.send_alert(
                     message=msg, level="High", alert_type="outage_alerts"
@@ -812,7 +867,9 @@ class SlackAlerting(CustomLogger):
                 outage_value["major_alert_sent"] = True
 
             ## update cache ##
-            await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
+            await self.internal_usage_cache.async_set_cache(
+                key=deployment_id, value=outage_value
+            )
         except Exception as e:
             pass
 
@@ -1075,8 +1132,6 @@ Model Info: 
                 _region_name = ""
 
                 await self.outage_alerts(
-                    provider=kwargs.get("custom_llm_provider", "") or "",
-                    region_name=_region_name,
                     exception=kwargs["exception"],
                     deployment_id=model_id,
                 )
diff --git a/litellm/main.py b/litellm/main.py
index 9ff474af1..7e6919f0e 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -868,6 +868,7 @@ def completion(
             user=user,
             optional_params=optional_params,
             litellm_params=litellm_params,
+            custom_llm_provider=custom_llm_provider,
         )
         if mock_response:
             return mock_completion(
diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml
index 105e00e18..290f2fd70 100644
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@@ -24,7 +24,7 @@ litellm_settings:
 
 general_settings:
   alerting: ["slack"]
-  alerting_args:
-    report_check_interval: 10
-  enable_jwt_auth: True
+  # alerting_args:
+  #   report_check_interval: 10
+  # enable_jwt_auth: True
 
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index d9c85623a..9a0fe07fe 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -3007,7 +3007,7 @@ class ProxyConfig:
                 general_settings["alert_types"] = _general_settings["alert_types"]
                 proxy_logging_obj.alert_types = general_settings["alert_types"]
                 proxy_logging_obj.slack_alerting_instance.update_values(
-                    alert_types=general_settings["alert_types"]
+                    alert_types=general_settings["alert_types"], llm_router=llm_router
                 )
 
             if "alert_to_webhook_url" in _general_settings:
@@ -3015,7 +3015,8 @@ class ProxyConfig:
                     "alert_to_webhook_url"
                 ]
                 proxy_logging_obj.slack_alerting_instance.update_values(
-                    alert_to_webhook_url=general_settings["alert_to_webhook_url"]
+                    alert_to_webhook_url=general_settings["alert_to_webhook_url"],
+                    llm_router=llm_router,
                 )
 
     async def _update_general_settings(self, db_general_settings: Optional[Json]):
@@ -3583,6 +3584,9 @@ async def startup_event():
     ## Error Tracking ##
     error_tracking()
 
+    ## UPDATE SLACK ALERTING ##
+    proxy_logging_obj.slack_alerting_instance.update_values(llm_router=llm_router)
+
     db_writer_client = HTTPHandler()
 
     proxy_logging_obj._init_litellm_callbacks()  # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
diff --git a/litellm/router.py b/litellm/router.py
index 4c3361051..d9563877c 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -3876,13 +3876,13 @@ class Router:
                         _api_base = litellm.get_api_base(
                             model=_model_name, optional_params=temp_litellm_params
                         )
-                        asyncio.create_task(
-                            proxy_logging_obj.slack_alerting_instance.send_alert(
-                                message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
-                                alert_type="cooldown_deployment",
-                                level="Low",
-                            )
-                        )
+                        # asyncio.create_task(
+                        #     proxy_logging_obj.slack_alerting_instance.send_alert(
+                        #         message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
+                        #         alert_type="cooldown_deployment",
+                        #         level="Low",
+                        #     )
+                        # )
 
                 except Exception as e:
                     pass
diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py
index 703d26137..b3292904a 100644
--- a/litellm/tests/test_alerting.py
+++ b/litellm/tests/test_alerting.py
@@ -572,6 +572,8 @@ async def test_outage_alerting_called(
         num_retries=0,
         allowed_fails=100,
     )
+
+    slack_alerting.update_values(llm_router=router)
     with patch.object(
         slack_alerting, "outage_alerts", new=AsyncMock()
     ) as mock_send_alert:
diff --git a/litellm/utils.py b/litellm/utils.py
index 1ed70a942..8ef527642 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -6286,7 +6286,9 @@ def get_model_region(
     return None
 
 
-def get_api_base(model: str, optional_params: dict) -> Optional[str]:
+def get_api_base(
+    model: str, optional_params: Union[dict, LiteLLM_Params]
+) -> Optional[str]:
     """
     Returns the api base used for calling the model.
 
@@ -6306,7 +6308,9 @@ def get_api_base(model: str, optional_params: dict) -> Optional[str]:
     """
 
     try:
-        if "model" in optional_params:
+        if isinstance(optional_params, LiteLLM_Params):
+            _optional_params = optional_params
+        elif "model" in optional_params:
             _optional_params = LiteLLM_Params(**optional_params)
         else:  # prevent needing to copy and pop the dict
             _optional_params = LiteLLM_Params(
@@ -6699,6 +6703,8 @@ def get_llm_provider(
     Returns the provider for a given model name - e.g. 'azure/chatgpt-v-2' -> 'azure'
 
     For router -> Can also give the whole litellm param dict -> this function will extract the relevant details
+
+    Raises Error - if unable to map model to a provider
     """
     try:
         ## IF LITELLM PARAMS GIVEN ##
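Illustration (not part of the diff): the new Slack messages embed the output of `_count_outage_alerts`, which buckets the cached status codes into timeout (408), API (>=500), and unknown errors, then renders only the non-zero buckets. Below is a minimal standalone sketch of that behavior; `count_outage_alerts` is a hypothetical free function standing in for the `SlackAlerting` method shown in the diff.

    from typing import List

    def count_outage_alerts(alerts: List[int]) -> str:
        # Mirrors SlackAlerting._count_outage_alerts from the diff:
        # bucket each status code, then render only the non-zero buckets.
        error_breakdown = {"Timeout Errors": 0, "API Errors": 0, "Unknown Errors": 0}
        for alert in alerts:
            if alert == 408:
                error_breakdown["Timeout Errors"] += 1
            elif alert >= 500:
                error_breakdown["API Errors"] += 1
            else:
                error_breakdown["Unknown Errors"] += 1

        error_msg = ""
        for key, value in error_breakdown.items():
            if value > 0:
                error_msg += "\n{}: {}\n".format(key, value)
        return error_msg

    # Two timeouts and three 5xx errors cached for one deployment:
    print(count_outage_alerts([408, 408, 500, 502, 503]))
    # Prints:
    #
    # Timeout Errors: 2
    #
    # API Errors: 3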
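Also illustrative: the `get_api_base` change widens `optional_params` to `Union[dict, LiteLLM_Params]`, so a caller that already holds a typed params object (as `outage_alerts` now does with `deployment.litellm_params`) can pass it through without a dict round-trip. A rough sketch of that dispatch pattern, with a dataclass standing in for litellm's actual `LiteLLM_Params` model:

    from dataclasses import dataclass
    from typing import Optional, Union

    @dataclass
    class LiteLLMParams:
        # Hypothetical stand-in for litellm's LiteLLM_Params model.
        model: str
        api_base: Optional[str] = None

    def get_api_base(model: str, optional_params: Union[dict, LiteLLMParams]) -> Optional[str]:
        # Accept an already-typed params object as-is; only parse raw dicts.
        if isinstance(optional_params, LiteLLMParams):
            _optional_params = optional_params
        else:
            _optional_params = LiteLLMParams(model=model, **optional_params)
        return _optional_params.api_base

    # Both call styles resolve the same api_base:
    print(get_api_base("azure/gpt-4", {"api_base": "https://my-endpoint.example.com"}))
    print(get_api_base("azure/gpt-4", LiteLLMParams(model="azure/gpt-4", api_base="https://my-endpoint.example.com")))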