fix(slack_alerting.py): fixes for outage alerting

This commit is contained in:
Krrish Dholakia 2024-05-24 17:17:17 -07:00
parent 7368406c24
commit a8fb4e33d5
2 changed files with 88 additions and 72 deletions

View file

@ -740,6 +740,8 @@ class SlackAlerting(CustomLogger):
ttl = 1hr
max_alerts_size = 10
"""
try:
_id = provider + region_name
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore
@ -771,12 +773,11 @@ class SlackAlerting(CustomLogger):
outage_value["alerts"].append(exception.message)
outage_value["deployment_ids"].append(deployment_id)
outage_value["last_updated_at"] = time.time()
## MINOR OUTAGE ALERT SENT ##
if (
outage_value["minor_alert_sent"] == False
and len(outage_value["alerts"])
> self.alerting_args.minor_outage_alert_threshold
>= self.alerting_args.minor_outage_alert_threshold
):
msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
provider,
@ -795,7 +796,7 @@ class SlackAlerting(CustomLogger):
elif (
outage_value["major_alert_sent"] == False
and len(outage_value["alerts"])
> self.alerting_args.major_outage_alert_threshold
>= self.alerting_args.major_outage_alert_threshold
):
msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
provider,
@ -804,12 +805,16 @@ class SlackAlerting(CustomLogger):
outage_value["last_updated_at"],
)
# send major alert
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
await self.send_alert(
message=msg, level="High", alert_type="outage_alerts"
)
# set to true
outage_value["major_alert_sent"] = True
## update cache ##
await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
except Exception as e:
pass
async def model_added_alert(
self, model_name: str, litellm_model_name: str, passed_model_info: Any

View file

@ -555,16 +555,30 @@ async def test_outage_alerting_called(
),
),
)
router = Router(
model_list=[
{
"model_name": model,
"litellm_params": {
"model": model,
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": api_base,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
},
}
],
num_retries=0,
allowed_fails=100,
)
with patch.object(
slack_alerting, "outage_alerts", new=AsyncMock()
) as mock_send_alert:
try:
await litellm.acompletion(
await router.acompletion(
model=model,
messages=[{"role": "user", "content": "Hey!"}],
api_base=api_base,
vertex_location=vertex_location,
vertex_project=vertex_project,
mock_response=error_to_raise,
)
except Exception as e:
@ -575,17 +589,14 @@ async def test_outage_alerting_called(
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
for _ in range(3):
try:
await litellm.acompletion(
await router.acompletion(
model=model,
messages=[{"role": "user", "content": "Hey!"}],
api_base=api_base,
vertex_location=vertex_location,
vertex_project=vertex_project,
mock_response=error_to_raise,
)
except Exception as e:
pass
await asyncio.sleep(3)
if error_code == 500 or error_code == 408:
mock_send_alert.assert_called_once()
else: