mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-28 04:04:31 +00:00
fix(slack_alerting.py): fixes for outage alerting
This commit is contained in:
parent
7368406c24
commit
a8fb4e33d5
2 changed files with 88 additions and 72 deletions
|
@ -740,76 +740,81 @@ class SlackAlerting(CustomLogger):
|
||||||
ttl = 1hr
|
ttl = 1hr
|
||||||
max_alerts_size = 10
|
max_alerts_size = 10
|
||||||
"""
|
"""
|
||||||
_id = provider + region_name
|
try:
|
||||||
|
|
||||||
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore
|
_id = provider + region_name
|
||||||
|
|
||||||
if (
|
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore
|
||||||
getattr(exception, "status_code", None) is not None
|
|
||||||
and exception.status_code != 408 # type: ignore
|
|
||||||
and exception.status_code < 500 # type: ignore
|
|
||||||
):
|
|
||||||
return
|
|
||||||
|
|
||||||
if outage_value is None:
|
if (
|
||||||
outage_value = OutageModel(
|
getattr(exception, "status_code", None) is not None
|
||||||
provider=provider,
|
and exception.status_code != 408 # type: ignore
|
||||||
region_name=region_name,
|
and exception.status_code < 500 # type: ignore
|
||||||
alerts=[exception.message],
|
):
|
||||||
deployment_ids=[deployment_id],
|
return
|
||||||
minor_alert_sent=False,
|
|
||||||
major_alert_sent=False,
|
|
||||||
last_updated_at=time.time(),
|
|
||||||
)
|
|
||||||
|
|
||||||
## add to cache ##
|
if outage_value is None:
|
||||||
await self.internal_usage_cache.async_set_cache(
|
outage_value = OutageModel(
|
||||||
key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl
|
provider=provider,
|
||||||
)
|
region_name=region_name,
|
||||||
return
|
alerts=[exception.message],
|
||||||
|
deployment_ids=[deployment_id],
|
||||||
|
minor_alert_sent=False,
|
||||||
|
major_alert_sent=False,
|
||||||
|
last_updated_at=time.time(),
|
||||||
|
)
|
||||||
|
|
||||||
outage_value["alerts"].append(exception.message)
|
## add to cache ##
|
||||||
outage_value["deployment_ids"].append(deployment_id)
|
await self.internal_usage_cache.async_set_cache(
|
||||||
outage_value["last_updated_at"] = time.time()
|
key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
## MINOR OUTAGE ALERT SENT ##
|
outage_value["alerts"].append(exception.message)
|
||||||
if (
|
outage_value["deployment_ids"].append(deployment_id)
|
||||||
outage_value["minor_alert_sent"] == False
|
outage_value["last_updated_at"] = time.time()
|
||||||
and len(outage_value["alerts"])
|
## MINOR OUTAGE ALERT SENT ##
|
||||||
> self.alerting_args.minor_outage_alert_threshold
|
if (
|
||||||
):
|
outage_value["minor_alert_sent"] == False
|
||||||
msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
|
and len(outage_value["alerts"])
|
||||||
provider,
|
>= self.alerting_args.minor_outage_alert_threshold
|
||||||
region_name,
|
):
|
||||||
outage_value["alerts"],
|
msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
|
||||||
outage_value["last_updated_at"],
|
provider,
|
||||||
)
|
region_name,
|
||||||
# send minor alert
|
outage_value["alerts"],
|
||||||
_result_val = self.send_alert(
|
outage_value["last_updated_at"],
|
||||||
message=msg, level="Medium", alert_type="outage_alerts"
|
)
|
||||||
)
|
# send minor alert
|
||||||
if _result_val is not None:
|
_result_val = self.send_alert(
|
||||||
await _result_val
|
message=msg, level="Medium", alert_type="outage_alerts"
|
||||||
# set to true
|
)
|
||||||
outage_value["minor_alert_sent"] = True
|
if _result_val is not None:
|
||||||
elif (
|
await _result_val
|
||||||
outage_value["major_alert_sent"] == False
|
# set to true
|
||||||
and len(outage_value["alerts"])
|
outage_value["minor_alert_sent"] = True
|
||||||
> self.alerting_args.major_outage_alert_threshold
|
elif (
|
||||||
):
|
outage_value["major_alert_sent"] == False
|
||||||
msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
|
and len(outage_value["alerts"])
|
||||||
provider,
|
>= self.alerting_args.major_outage_alert_threshold
|
||||||
region_name,
|
):
|
||||||
outage_value["alerts"],
|
msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
|
||||||
outage_value["last_updated_at"],
|
provider,
|
||||||
)
|
region_name,
|
||||||
# send major alert
|
outage_value["alerts"],
|
||||||
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
|
outage_value["last_updated_at"],
|
||||||
# set to true
|
)
|
||||||
outage_value["major_alert_sent"] = True
|
# send major alert
|
||||||
|
await self.send_alert(
|
||||||
|
message=msg, level="High", alert_type="outage_alerts"
|
||||||
|
)
|
||||||
|
# set to true
|
||||||
|
outage_value["major_alert_sent"] = True
|
||||||
|
|
||||||
## update cache ##
|
## update cache ##
|
||||||
await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
|
await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
async def model_added_alert(
|
async def model_added_alert(
|
||||||
self, model_name: str, litellm_model_name: str, passed_model_info: Any
|
self, model_name: str, litellm_model_name: str, passed_model_info: Any
|
||||||
|
|
|
@ -555,16 +555,30 @@ async def test_outage_alerting_called(
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
router = Router(
|
||||||
|
model_list=[
|
||||||
|
{
|
||||||
|
"model_name": model,
|
||||||
|
"litellm_params": {
|
||||||
|
"model": model,
|
||||||
|
"api_key": os.getenv("AZURE_API_KEY"),
|
||||||
|
"api_base": api_base,
|
||||||
|
"vertex_location": vertex_location,
|
||||||
|
"vertex_project": vertex_project,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
num_retries=0,
|
||||||
|
allowed_fails=100,
|
||||||
|
)
|
||||||
with patch.object(
|
with patch.object(
|
||||||
slack_alerting, "outage_alerts", new=AsyncMock()
|
slack_alerting, "outage_alerts", new=AsyncMock()
|
||||||
) as mock_send_alert:
|
) as mock_send_alert:
|
||||||
try:
|
try:
|
||||||
await litellm.acompletion(
|
await router.acompletion(
|
||||||
model=model,
|
model=model,
|
||||||
messages=[{"role": "user", "content": "Hey!"}],
|
messages=[{"role": "user", "content": "Hey!"}],
|
||||||
api_base=api_base,
|
|
||||||
vertex_location=vertex_location,
|
|
||||||
vertex_project=vertex_project,
|
|
||||||
mock_response=error_to_raise,
|
mock_response=error_to_raise,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -575,17 +589,14 @@ async def test_outage_alerting_called(
|
||||||
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
|
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
|
||||||
for _ in range(3):
|
for _ in range(3):
|
||||||
try:
|
try:
|
||||||
await litellm.acompletion(
|
await router.acompletion(
|
||||||
model=model,
|
model=model,
|
||||||
messages=[{"role": "user", "content": "Hey!"}],
|
messages=[{"role": "user", "content": "Hey!"}],
|
||||||
api_base=api_base,
|
|
||||||
vertex_location=vertex_location,
|
|
||||||
vertex_project=vertex_project,
|
|
||||||
mock_response=error_to_raise,
|
mock_response=error_to_raise,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
|
await asyncio.sleep(3)
|
||||||
if error_code == 500 or error_code == 408:
|
if error_code == 500 or error_code == 408:
|
||||||
mock_send_alert.assert_called_once()
|
mock_send_alert.assert_called_once()
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue