feat(slack_alerting.py): refactor region outage alerting to do model based alerting instead

The Azure region cannot be reliably extracted from the API base, so it makes sense to start with model-based alerting and move to region-based alerting later.
This commit is contained in:
Krrish Dholakia 2024-05-24 19:10:33 -07:00
parent 2cdb0584d1
commit 8dec87425e
7 changed files with 119 additions and 51 deletions

View file

@ -23,9 +23,8 @@ import litellm.types.router
class OutageModel(TypedDict):
provider: str
region_name: str
alerts: List[str]
model_id: str
alerts: List[int]
deployment_ids: List[str]
minor_alert_sent: bool
major_alert_sent: bool
@ -146,6 +145,7 @@ class SlackAlerting(CustomLogger):
self.is_running = False
self.alerting_args = SlackAlertingArgs(**alerting_args)
self.default_webhook_url = default_webhook_url
self.llm_router: Optional[litellm.Router] = None
def update_values(
self,
@ -154,6 +154,7 @@ class SlackAlerting(CustomLogger):
alert_types: Optional[List] = None,
alert_to_webhook_url: Optional[Dict] = None,
alerting_args: Optional[Dict] = None,
llm_router: Optional[litellm.Router] = None,
):
if alerting is not None:
self.alerting = alerting
@ -169,6 +170,8 @@ class SlackAlerting(CustomLogger):
self.alert_to_webhook_url = alert_to_webhook_url
else:
self.alert_to_webhook_url.update(alert_to_webhook_url)
if llm_router is not None:
self.llm_router = llm_router
async def deployment_in_cooldown(self):
pass
@ -718,21 +721,42 @@ class SlackAlerting(CustomLogger):
return
return
def _count_outage_alerts(self, alerts: List[int]) -> str:
    """
    Build the human-readable error breakdown used in outage alert messages.

    Parameters:
    - alerts: List[int] -> status codes collected for a deployment.
      408 is tallied as a timeout, anything >= 500 as an API error,
      and every other code as an unknown error.

    Returns:
    - str -> one "<category>: <count>" entry per category that occurred,
      each wrapped in newlines; an empty string when nothing was tallied.
    """
    # Insertion order of the keys fixes the order of the lines in the message.
    tally = dict.fromkeys(("Timeout Errors", "API Errors", "Unknown Errors"), 0)

    for status_code in alerts:
        if status_code == 408:
            category = "Timeout Errors"
        else:
            category = "API Errors" if status_code >= 500 else "Unknown Errors"
        tally[category] += 1

    # Categories with a zero count are left out of the message entirely.
    return "".join(
        "\n{}: {}\n".format(category, count)
        for category, count in tally.items()
        if count > 0
    )
async def outage_alerts(
self,
provider: str,
region_name: str,
exception: APIError,
deployment_id: str,
) -> None:
"""
Send slack alert if provider region (e.g. azure east-us-1) is having an outage (408 or >500 errors).
Send slack alert if model is having an outage (408 or >=500 errors).
key = (provider + region)
key = model_id
value = {
- provider
- region
- model_id
- threshold
- alerts []
}
@ -741,23 +765,37 @@ class SlackAlerting(CustomLogger):
max_alerts_size = 10
"""
try:
_id = provider + region_name
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=deployment_id) # type: ignore
if (
getattr(exception, "status_code", None) is not None
and exception.status_code != 408 # type: ignore
getattr(exception, "status_code", None) is None
or (
exception.status_code != 408 # type: ignore
and exception.status_code < 500 # type: ignore
)
or self.llm_router is None
):
return
### EXTRACT MODEL DETAILS ###
deployment = self.llm_router.get_deployment(model_id=deployment_id)
if deployment is None:
return
model = deployment.litellm_params.model
provider = deployment.litellm_params.custom_llm_provider
if provider is None:
try:
model, provider, _, _ = litellm.get_llm_provider(model=model)
except Exception as e:
provider = ""
api_base = litellm.get_api_base(
model=model, optional_params=deployment.litellm_params
)
if outage_value is None:
outage_value = OutageModel(
provider=provider,
region_name=region_name,
alerts=[exception.message],
model_id=deployment_id,
alerts=[exception.status_code], # type: ignore
deployment_ids=[deployment_id],
minor_alert_sent=False,
major_alert_sent=False,
@ -766,25 +804,35 @@ class SlackAlerting(CustomLogger):
## add to cache ##
await self.internal_usage_cache.async_set_cache(
key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl
key=deployment_id,
value=outage_value,
ttl=self.alerting_args.outage_alert_ttl,
)
return
outage_value["alerts"].append(exception.message)
outage_value["alerts"].append(exception.status_code) # type: ignore
outage_value["deployment_ids"].append(deployment_id)
outage_value["last_updated_at"] = time.time()
## MINOR OUTAGE ALERT SENT ##
if (
outage_value["minor_alert_sent"] == False
and len(outage_value["alerts"])
>= self.alerting_args.minor_outage_alert_threshold
):
msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
provider,
region_name,
outage_value["alerts"],
outage_value["last_updated_at"],
)
msg = f"""\n\n
* Minor Service Outage*
*Model Name:* `{model}`
*Provider:* `{provider}`
*API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
# send minor alert
_result_val = self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
@ -798,12 +846,19 @@ class SlackAlerting(CustomLogger):
and len(outage_value["alerts"])
>= self.alerting_args.major_outage_alert_threshold
):
msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format(
provider,
region_name,
outage_value["alerts"],
outage_value["last_updated_at"],
)
msg = f"""\n\n
* Major Service Outage*
*Model Name:* `{model}`
*Provider:* `{provider}`
*API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
# send major alert
await self.send_alert(
message=msg, level="High", alert_type="outage_alerts"
@ -812,7 +867,9 @@ class SlackAlerting(CustomLogger):
outage_value["major_alert_sent"] = True
## update cache ##
await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
await self.internal_usage_cache.async_set_cache(
key=deployment_id, value=outage_value
)
except Exception as e:
pass
@ -1075,8 +1132,6 @@ Model Info:
_region_name = ""
await self.outage_alerts(
provider=kwargs.get("custom_llm_provider", "") or "",
region_name=_region_name,
exception=kwargs["exception"],
deployment_id=model_id,
)

View file

@ -868,6 +868,7 @@ def completion(
user=user,
optional_params=optional_params,
litellm_params=litellm_params,
custom_llm_provider=custom_llm_provider,
)
if mock_response:
return mock_completion(

View file

@ -24,7 +24,7 @@ litellm_settings:
general_settings:
alerting: ["slack"]
alerting_args:
report_check_interval: 10
enable_jwt_auth: True
# alerting_args:
# report_check_interval: 10
# enable_jwt_auth: True

View file

@ -3007,7 +3007,7 @@ class ProxyConfig:
general_settings["alert_types"] = _general_settings["alert_types"]
proxy_logging_obj.alert_types = general_settings["alert_types"]
proxy_logging_obj.slack_alerting_instance.update_values(
alert_types=general_settings["alert_types"]
alert_types=general_settings["alert_types"], llm_router=llm_router
)
if "alert_to_webhook_url" in _general_settings:
@ -3015,7 +3015,8 @@ class ProxyConfig:
"alert_to_webhook_url"
]
proxy_logging_obj.slack_alerting_instance.update_values(
alert_to_webhook_url=general_settings["alert_to_webhook_url"]
alert_to_webhook_url=general_settings["alert_to_webhook_url"],
llm_router=llm_router,
)
async def _update_general_settings(self, db_general_settings: Optional[Json]):
@ -3583,6 +3584,9 @@ async def startup_event():
## Error Tracking ##
error_tracking()
## UPDATE SLACK ALERTING ##
proxy_logging_obj.slack_alerting_instance.update_values(llm_router=llm_router)
db_writer_client = HTTPHandler()
proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made

View file

@ -3876,13 +3876,13 @@ class Router:
_api_base = litellm.get_api_base(
model=_model_name, optional_params=temp_litellm_params
)
asyncio.create_task(
proxy_logging_obj.slack_alerting_instance.send_alert(
message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
alert_type="cooldown_deployment",
level="Low",
)
)
# asyncio.create_task(
# proxy_logging_obj.slack_alerting_instance.send_alert(
# message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
# alert_type="cooldown_deployment",
# level="Low",
# )
# )
except Exception as e:
pass

View file

@ -572,6 +572,8 @@ async def test_outage_alerting_called(
num_retries=0,
allowed_fails=100,
)
slack_alerting.update_values(llm_router=router)
with patch.object(
slack_alerting, "outage_alerts", new=AsyncMock()
) as mock_send_alert:

View file

@ -6286,7 +6286,9 @@ def get_model_region(
return None
def get_api_base(model: str, optional_params: dict) -> Optional[str]:
def get_api_base(
model: str, optional_params: Union[dict, LiteLLM_Params]
) -> Optional[str]:
"""
Returns the api base used for calling the model.
@ -6306,7 +6308,9 @@ def get_api_base(model: str, optional_params: dict) -> Optional[str]:
"""
try:
if "model" in optional_params:
if isinstance(optional_params, LiteLLM_Params):
_optional_params = optional_params
elif "model" in optional_params:
_optional_params = LiteLLM_Params(**optional_params)
else: # prevent needing to copy and pop the dict
_optional_params = LiteLLM_Params(
@ -6699,6 +6703,8 @@ def get_llm_provider(
Returns the provider for a given model name - e.g. 'azure/chatgpt-v-2' -> 'azure'
For router -> Can also give the whole litellm param dict -> this function will extract the relevant details
Raises Error - if unable to map model to a provider
"""
try:
## IF LITELLM PARAMS GIVEN ##