diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index 0cea0fee2b..2a2ec4ab70 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -740,76 +740,81 @@ class SlackAlerting(CustomLogger): ttl = 1hr max_alerts_size = 10 """ - _id = provider + region_name + try: - outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore + _id = provider + region_name - if ( - getattr(exception, "status_code", None) is not None - and exception.status_code != 408 # type: ignore - and exception.status_code < 500 # type: ignore - ): - return + outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore - if outage_value is None: - outage_value = OutageModel( - provider=provider, - region_name=region_name, - alerts=[exception.message], - deployment_ids=[deployment_id], - minor_alert_sent=False, - major_alert_sent=False, - last_updated_at=time.time(), - ) + if ( + getattr(exception, "status_code", None) is not None + and exception.status_code != 408 # type: ignore + and exception.status_code < 500 # type: ignore + ): + return - ## add to cache ## - await self.internal_usage_cache.async_set_cache( - key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl - ) - return + if outage_value is None: + outage_value = OutageModel( + provider=provider, + region_name=region_name, + alerts=[exception.message], + deployment_ids=[deployment_id], + minor_alert_sent=False, + major_alert_sent=False, + last_updated_at=time.time(), + ) - outage_value["alerts"].append(exception.message) - outage_value["deployment_ids"].append(deployment_id) - outage_value["last_updated_at"] = time.time() + ## add to cache ## + await self.internal_usage_cache.async_set_cache( + key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl + ) + return - ## MINOR OUTAGE ALERT SENT ## - if ( - outage_value["minor_alert_sent"] == False - and len(outage_value["alerts"]) - > self.alerting_args.minor_outage_alert_threshold - ): - msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( - provider, - region_name, - outage_value["alerts"], - outage_value["last_updated_at"], - ) - # send minor alert - _result_val = self.send_alert( - message=msg, level="Medium", alert_type="outage_alerts" - ) - if _result_val is not None: - await _result_val - # set to true - outage_value["minor_alert_sent"] = True - elif ( - outage_value["major_alert_sent"] == False - and len(outage_value["alerts"]) - > self.alerting_args.major_outage_alert_threshold - ): - msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( - provider, - region_name, - outage_value["alerts"], - outage_value["last_updated_at"], - ) - # send minor alert - await self.send_alert(message=msg, level="High", alert_type="outage_alerts") - # set to true - outage_value["major_alert_sent"] = True + outage_value["alerts"].append(exception.message) + outage_value["deployment_ids"].append(deployment_id) + outage_value["last_updated_at"] = time.time() + ## MINOR OUTAGE ALERT SENT ## + if ( + outage_value["minor_alert_sent"] == False + and len(outage_value["alerts"]) + >= self.alerting_args.minor_outage_alert_threshold + ): + msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( + provider, + region_name, + outage_value["alerts"], + outage_value["last_updated_at"], + ) + # send minor alert + _result_val = self.send_alert( + message=msg, level="Medium", alert_type="outage_alerts" + ) + if _result_val is not None: + await _result_val + # set to true + outage_value["minor_alert_sent"] = True + elif ( + outage_value["major_alert_sent"] == False + and len(outage_value["alerts"]) + >= self.alerting_args.major_outage_alert_threshold + ): + msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( + provider, + region_name, + outage_value["alerts"], + outage_value["last_updated_at"], + ) + # send minor alert + await self.send_alert( + message=msg, level="High", alert_type="outage_alerts" + ) + # set to true + outage_value["major_alert_sent"] = True - ## update cache ## - await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value) + ## update cache ## + await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value) + except Exception as e: + pass async def model_added_alert( self, model_name: str, litellm_model_name: str, passed_model_info: Any diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py index 7a4214bbd8..703d261370 100644 --- a/litellm/tests/test_alerting.py +++ b/litellm/tests/test_alerting.py @@ -555,16 +555,30 @@ async def test_outage_alerting_called( ), ), ) + + router = Router( + model_list=[ + { + "model_name": model, + "litellm_params": { + "model": model, + "api_key": os.getenv("AZURE_API_KEY"), + "api_base": api_base, + "vertex_location": vertex_location, + "vertex_project": vertex_project, + }, + } + ], + num_retries=0, + allowed_fails=100, + ) with patch.object( slack_alerting, "outage_alerts", new=AsyncMock() ) as mock_send_alert: try: - await litellm.acompletion( + await router.acompletion( model=model, messages=[{"role": "user", "content": "Hey!"}], - api_base=api_base, - vertex_location=vertex_location, - vertex_project=vertex_project, mock_response=error_to_raise, ) except Exception as e: @@ -575,17 +589,14 @@ async def test_outage_alerting_called( with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert: for _ in range(3): try: - await litellm.acompletion( + await router.acompletion( model=model, messages=[{"role": "user", "content": "Hey!"}], - api_base=api_base, - vertex_location=vertex_location, - vertex_project=vertex_project, mock_response=error_to_raise, ) except Exception as e: pass - + await asyncio.sleep(3) if error_code == 500 or error_code == 408: mock_send_alert.assert_called_once() else: