mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-28 04:04:31 +00:00
fix(slack_alerting.py): fixes for outage alerting
This commit is contained in:
parent
7368406c24
commit
a8fb4e33d5
2 changed files with 88 additions and 72 deletions
|
@ -740,6 +740,8 @@ class SlackAlerting(CustomLogger):
|
|||
ttl = 1hr
|
||||
max_alerts_size = 10
|
||||
"""
|
||||
try:
|
||||
|
||||
_id = provider + region_name
|
||||
|
||||
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore
|
||||
|
@ -771,12 +773,11 @@ class SlackAlerting(CustomLogger):
|
|||
outage_value["alerts"].append(exception.message)
|
||||
outage_value["deployment_ids"].append(deployment_id)
|
||||
outage_value["last_updated_at"] = time.time()
|
||||
|
||||
## MINOR OUTAGE ALERT SENT ##
|
||||
if (
|
||||
outage_value["minor_alert_sent"] == False
|
||||
and len(outage_value["alerts"])
|
||||
> self.alerting_args.minor_outage_alert_threshold
|
||||
>= self.alerting_args.minor_outage_alert_threshold
|
||||
):
|
||||
msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
|
||||
provider,
|
||||
|
@ -795,7 +796,7 @@ class SlackAlerting(CustomLogger):
|
|||
elif (
|
||||
outage_value["major_alert_sent"] == False
|
||||
and len(outage_value["alerts"])
|
||||
> self.alerting_args.major_outage_alert_threshold
|
||||
>= self.alerting_args.major_outage_alert_threshold
|
||||
):
|
||||
msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
|
||||
provider,
|
||||
|
@ -804,12 +805,16 @@ class SlackAlerting(CustomLogger):
|
|||
outage_value["last_updated_at"],
|
||||
)
|
||||
# send major alert
|
||||
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
|
||||
await self.send_alert(
|
||||
message=msg, level="High", alert_type="outage_alerts"
|
||||
)
|
||||
# set to true
|
||||
outage_value["major_alert_sent"] = True
|
||||
|
||||
## update cache ##
|
||||
await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
async def model_added_alert(
|
||||
self, model_name: str, litellm_model_name: str, passed_model_info: Any
|
||||
|
|
|
@ -555,16 +555,30 @@ async def test_outage_alerting_called(
|
|||
),
|
||||
),
|
||||
)
|
||||
|
||||
router = Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": model,
|
||||
"litellm_params": {
|
||||
"model": model,
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_base": api_base,
|
||||
"vertex_location": vertex_location,
|
||||
"vertex_project": vertex_project,
|
||||
},
|
||||
}
|
||||
],
|
||||
num_retries=0,
|
||||
allowed_fails=100,
|
||||
)
|
||||
with patch.object(
|
||||
slack_alerting, "outage_alerts", new=AsyncMock()
|
||||
) as mock_send_alert:
|
||||
try:
|
||||
await litellm.acompletion(
|
||||
await router.acompletion(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "Hey!"}],
|
||||
api_base=api_base,
|
||||
vertex_location=vertex_location,
|
||||
vertex_project=vertex_project,
|
||||
mock_response=error_to_raise,
|
||||
)
|
||||
except Exception as e:
|
||||
|
@ -575,17 +589,14 @@ async def test_outage_alerting_called(
|
|||
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
|
||||
for _ in range(3):
|
||||
try:
|
||||
await litellm.acompletion(
|
||||
await router.acompletion(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "Hey!"}],
|
||||
api_base=api_base,
|
||||
vertex_location=vertex_location,
|
||||
vertex_project=vertex_project,
|
||||
mock_response=error_to_raise,
|
||||
)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
await asyncio.sleep(3)
|
||||
if error_code == 500 or error_code == 408:
|
||||
mock_send_alert.assert_called_once()
|
||||
else:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue