fix(slack_alerting.py): fixes for outage alerting

This commit is contained in:
Krrish Dholakia 2024-05-24 17:17:17 -07:00
parent 7368406c24
commit a8fb4e33d5
2 changed files with 88 additions and 72 deletions

View file

@ -740,76 +740,81 @@ class SlackAlerting(CustomLogger):
ttl = 1hr ttl = 1hr
max_alerts_size = 10 max_alerts_size = 10
""" """
_id = provider + region_name try:
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore _id = provider + region_name
if ( outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore
getattr(exception, "status_code", None) is not None
and exception.status_code != 408 # type: ignore
and exception.status_code < 500 # type: ignore
):
return
if outage_value is None: if (
outage_value = OutageModel( getattr(exception, "status_code", None) is not None
provider=provider, and exception.status_code != 408 # type: ignore
region_name=region_name, and exception.status_code < 500 # type: ignore
alerts=[exception.message], ):
deployment_ids=[deployment_id], return
minor_alert_sent=False,
major_alert_sent=False,
last_updated_at=time.time(),
)
## add to cache ## if outage_value is None:
await self.internal_usage_cache.async_set_cache( outage_value = OutageModel(
key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl provider=provider,
) region_name=region_name,
return alerts=[exception.message],
deployment_ids=[deployment_id],
minor_alert_sent=False,
major_alert_sent=False,
last_updated_at=time.time(),
)
outage_value["alerts"].append(exception.message) ## add to cache ##
outage_value["deployment_ids"].append(deployment_id) await self.internal_usage_cache.async_set_cache(
outage_value["last_updated_at"] = time.time() key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl
)
return
## MINOR OUTAGE ALERT SENT ## outage_value["alerts"].append(exception.message)
if ( outage_value["deployment_ids"].append(deployment_id)
outage_value["minor_alert_sent"] == False outage_value["last_updated_at"] = time.time()
and len(outage_value["alerts"]) ## MINOR OUTAGE ALERT SENT ##
> self.alerting_args.minor_outage_alert_threshold if (
): outage_value["minor_alert_sent"] == False
msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( and len(outage_value["alerts"])
provider, >= self.alerting_args.minor_outage_alert_threshold
region_name, ):
outage_value["alerts"], msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
outage_value["last_updated_at"], provider,
) region_name,
# send minor alert outage_value["alerts"],
_result_val = self.send_alert( outage_value["last_updated_at"],
message=msg, level="Medium", alert_type="outage_alerts" )
) # send minor alert
if _result_val is not None: _result_val = self.send_alert(
await _result_val message=msg, level="Medium", alert_type="outage_alerts"
# set to true )
outage_value["minor_alert_sent"] = True if _result_val is not None:
elif ( await _result_val
outage_value["major_alert_sent"] == False # set to true
and len(outage_value["alerts"]) outage_value["minor_alert_sent"] = True
> self.alerting_args.major_outage_alert_threshold elif (
): outage_value["major_alert_sent"] == False
msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( and len(outage_value["alerts"])
provider, >= self.alerting_args.major_outage_alert_threshold
region_name, ):
outage_value["alerts"], msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
outage_value["last_updated_at"], provider,
) region_name,
# send minor alert outage_value["alerts"],
await self.send_alert(message=msg, level="High", alert_type="outage_alerts") outage_value["last_updated_at"],
# set to true )
outage_value["major_alert_sent"] = True # send minor alert
await self.send_alert(
message=msg, level="High", alert_type="outage_alerts"
)
# set to true
outage_value["major_alert_sent"] = True
## update cache ## ## update cache ##
await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value) await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
except Exception as e:
pass
async def model_added_alert( async def model_added_alert(
self, model_name: str, litellm_model_name: str, passed_model_info: Any self, model_name: str, litellm_model_name: str, passed_model_info: Any

View file

@ -555,16 +555,30 @@ async def test_outage_alerting_called(
), ),
), ),
) )
router = Router(
model_list=[
{
"model_name": model,
"litellm_params": {
"model": model,
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": api_base,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
},
}
],
num_retries=0,
allowed_fails=100,
)
with patch.object( with patch.object(
slack_alerting, "outage_alerts", new=AsyncMock() slack_alerting, "outage_alerts", new=AsyncMock()
) as mock_send_alert: ) as mock_send_alert:
try: try:
await litellm.acompletion( await router.acompletion(
model=model, model=model,
messages=[{"role": "user", "content": "Hey!"}], messages=[{"role": "user", "content": "Hey!"}],
api_base=api_base,
vertex_location=vertex_location,
vertex_project=vertex_project,
mock_response=error_to_raise, mock_response=error_to_raise,
) )
except Exception as e: except Exception as e:
@ -575,17 +589,14 @@ async def test_outage_alerting_called(
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert: with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
for _ in range(3): for _ in range(3):
try: try:
await litellm.acompletion( await router.acompletion(
model=model, model=model,
messages=[{"role": "user", "content": "Hey!"}], messages=[{"role": "user", "content": "Hey!"}],
api_base=api_base,
vertex_location=vertex_location,
vertex_project=vertex_project,
mock_response=error_to_raise, mock_response=error_to_raise,
) )
except Exception as e: except Exception as e:
pass pass
await asyncio.sleep(3)
if error_code == 500 or error_code == 408: if error_code == 500 or error_code == 408:
mock_send_alert.assert_called_once() mock_send_alert.assert_called_once()
else: else: