feat(slack_alerting.py): refactor region outage alerting to do model based alerting instead

We are unable to extract the Azure region from the API base, so it makes sense to start with model-based alerting and move to region-based alerting later.
This commit is contained in:
Krrish Dholakia 2024-05-24 19:10:33 -07:00
parent 2cdb0584d1
commit 8dec87425e
7 changed files with 119 additions and 51 deletions

View file

@ -23,9 +23,8 @@ import litellm.types.router
class OutageModel(TypedDict): class OutageModel(TypedDict):
provider: str model_id: str
region_name: str alerts: List[int]
alerts: List[str]
deployment_ids: List[str] deployment_ids: List[str]
minor_alert_sent: bool minor_alert_sent: bool
major_alert_sent: bool major_alert_sent: bool
@ -146,6 +145,7 @@ class SlackAlerting(CustomLogger):
self.is_running = False self.is_running = False
self.alerting_args = SlackAlertingArgs(**alerting_args) self.alerting_args = SlackAlertingArgs(**alerting_args)
self.default_webhook_url = default_webhook_url self.default_webhook_url = default_webhook_url
self.llm_router: Optional[litellm.Router] = None
def update_values( def update_values(
self, self,
@ -154,6 +154,7 @@ class SlackAlerting(CustomLogger):
alert_types: Optional[List] = None, alert_types: Optional[List] = None,
alert_to_webhook_url: Optional[Dict] = None, alert_to_webhook_url: Optional[Dict] = None,
alerting_args: Optional[Dict] = None, alerting_args: Optional[Dict] = None,
llm_router: Optional[litellm.Router] = None,
): ):
if alerting is not None: if alerting is not None:
self.alerting = alerting self.alerting = alerting
@ -169,6 +170,8 @@ class SlackAlerting(CustomLogger):
self.alert_to_webhook_url = alert_to_webhook_url self.alert_to_webhook_url = alert_to_webhook_url
else: else:
self.alert_to_webhook_url.update(alert_to_webhook_url) self.alert_to_webhook_url.update(alert_to_webhook_url)
if llm_router is not None:
self.llm_router = llm_router
async def deployment_in_cooldown(self): async def deployment_in_cooldown(self):
pass pass
@ -718,21 +721,42 @@ class SlackAlerting(CustomLogger):
return return
return return
def _count_outage_alerts(self, alerts: List[int]) -> str:
"""
Parameters:
- alerts: List[int] -> list of error codes (either 408 or 500+)
Returns:
- str -> formatted string. This is an alert message, giving a human-friendly description of the errors.
"""
error_breakdown = {"Timeout Errors": 0, "API Errors": 0, "Unknown Errors": 0}
for alert in alerts:
if alert == 408:
error_breakdown["Timeout Errors"] += 1
elif alert >= 500:
error_breakdown["API Errors"] += 1
else:
error_breakdown["Unknown Errors"] += 1
error_msg = ""
for key, value in error_breakdown.items():
if value > 0:
error_msg += "\n{}: {}\n".format(key, value)
return error_msg
async def outage_alerts( async def outage_alerts(
self, self,
provider: str,
region_name: str,
exception: APIError, exception: APIError,
deployment_id: str, deployment_id: str,
) -> None: ) -> None:
""" """
Send slack alert if provider region (e.g. azure east-us-1) is having an outage (408 or >500 errors). Send slack alert if model is badly configured / having an outage (408, 401, 429, >=500).
key = (provider + region) key = model_id
value = { value = {
- provider - model_id
- region
- threshold - threshold
- alerts [] - alerts []
} }
@ -741,23 +765,37 @@ class SlackAlerting(CustomLogger):
max_alerts_size = 10 max_alerts_size = 10
""" """
try: try:
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=deployment_id) # type: ignore
_id = provider + region_name
outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id) # type: ignore
if ( if (
getattr(exception, "status_code", None) is not None getattr(exception, "status_code", None) is None
and exception.status_code != 408 # type: ignore or (
and exception.status_code < 500 # type: ignore exception.status_code != 408 # type: ignore
and exception.status_code < 500 # type: ignore
)
or self.llm_router is None
): ):
return return
### EXTRACT MODEL DETAILS ###
deployment = self.llm_router.get_deployment(model_id=deployment_id)
if deployment is None:
return
model = deployment.litellm_params.model
provider = deployment.litellm_params.custom_llm_provider
if provider is None:
try:
model, provider, _, _ = litellm.get_llm_provider(model=model)
except Exception as e:
provider = ""
api_base = litellm.get_api_base(
model=model, optional_params=deployment.litellm_params
)
if outage_value is None: if outage_value is None:
outage_value = OutageModel( outage_value = OutageModel(
provider=provider, model_id=deployment_id,
region_name=region_name, alerts=[exception.status_code], # type: ignore
alerts=[exception.message],
deployment_ids=[deployment_id], deployment_ids=[deployment_id],
minor_alert_sent=False, minor_alert_sent=False,
major_alert_sent=False, major_alert_sent=False,
@ -766,25 +804,35 @@ class SlackAlerting(CustomLogger):
## add to cache ## ## add to cache ##
await self.internal_usage_cache.async_set_cache( await self.internal_usage_cache.async_set_cache(
key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl key=deployment_id,
value=outage_value,
ttl=self.alerting_args.outage_alert_ttl,
) )
return return
outage_value["alerts"].append(exception.message) outage_value["alerts"].append(exception.status_code) # type: ignore
outage_value["deployment_ids"].append(deployment_id) outage_value["deployment_ids"].append(deployment_id)
outage_value["last_updated_at"] = time.time() outage_value["last_updated_at"] = time.time()
## MINOR OUTAGE ALERT SENT ## ## MINOR OUTAGE ALERT SENT ##
if ( if (
outage_value["minor_alert_sent"] == False outage_value["minor_alert_sent"] == False
and len(outage_value["alerts"]) and len(outage_value["alerts"])
>= self.alerting_args.minor_outage_alert_threshold >= self.alerting_args.minor_outage_alert_threshold
): ):
msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( msg = f"""\n\n
provider, * Minor Service Outage*
region_name,
outage_value["alerts"], *Model Name:* `{model}`
outage_value["last_updated_at"], *Provider:* `{provider}`
) *API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
# send minor alert # send minor alert
_result_val = self.send_alert( _result_val = self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts" message=msg, level="Medium", alert_type="outage_alerts"
@ -798,12 +846,19 @@ class SlackAlerting(CustomLogger):
and len(outage_value["alerts"]) and len(outage_value["alerts"])
>= self.alerting_args.major_outage_alert_threshold >= self.alerting_args.major_outage_alert_threshold
): ):
msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check:{}".format( msg = f"""\n\n
provider, * Major Service Outage*
region_name,
outage_value["alerts"], *Model Name:* `{model}`
outage_value["last_updated_at"], *Provider:* `{provider}`
) *API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
# send major alert # send major alert
await self.send_alert( await self.send_alert(
message=msg, level="High", alert_type="outage_alerts" message=msg, level="High", alert_type="outage_alerts"
@ -812,7 +867,9 @@ class SlackAlerting(CustomLogger):
outage_value["major_alert_sent"] = True outage_value["major_alert_sent"] = True
## update cache ## ## update cache ##
await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value) await self.internal_usage_cache.async_set_cache(
key=deployment_id, value=outage_value
)
except Exception as e: except Exception as e:
pass pass
@ -1075,8 +1132,6 @@ Model Info:
_region_name = "" _region_name = ""
await self.outage_alerts( await self.outage_alerts(
provider=kwargs.get("custom_llm_provider", "") or "",
region_name=_region_name,
exception=kwargs["exception"], exception=kwargs["exception"],
deployment_id=model_id, deployment_id=model_id,
) )

View file

@ -868,6 +868,7 @@ def completion(
user=user, user=user,
optional_params=optional_params, optional_params=optional_params,
litellm_params=litellm_params, litellm_params=litellm_params,
custom_llm_provider=custom_llm_provider,
) )
if mock_response: if mock_response:
return mock_completion( return mock_completion(

View file

@ -24,7 +24,7 @@ litellm_settings:
general_settings: general_settings:
alerting: ["slack"] alerting: ["slack"]
alerting_args: # alerting_args:
report_check_interval: 10 # report_check_interval: 10
enable_jwt_auth: True # enable_jwt_auth: True

View file

@ -3007,7 +3007,7 @@ class ProxyConfig:
general_settings["alert_types"] = _general_settings["alert_types"] general_settings["alert_types"] = _general_settings["alert_types"]
proxy_logging_obj.alert_types = general_settings["alert_types"] proxy_logging_obj.alert_types = general_settings["alert_types"]
proxy_logging_obj.slack_alerting_instance.update_values( proxy_logging_obj.slack_alerting_instance.update_values(
alert_types=general_settings["alert_types"] alert_types=general_settings["alert_types"], llm_router=llm_router
) )
if "alert_to_webhook_url" in _general_settings: if "alert_to_webhook_url" in _general_settings:
@ -3015,7 +3015,8 @@ class ProxyConfig:
"alert_to_webhook_url" "alert_to_webhook_url"
] ]
proxy_logging_obj.slack_alerting_instance.update_values( proxy_logging_obj.slack_alerting_instance.update_values(
alert_to_webhook_url=general_settings["alert_to_webhook_url"] alert_to_webhook_url=general_settings["alert_to_webhook_url"],
llm_router=llm_router,
) )
async def _update_general_settings(self, db_general_settings: Optional[Json]): async def _update_general_settings(self, db_general_settings: Optional[Json]):
@ -3583,6 +3584,9 @@ async def startup_event():
## Error Tracking ## ## Error Tracking ##
error_tracking() error_tracking()
## UPDATE SLACK ALERTING ##
proxy_logging_obj.slack_alerting_instance.update_values(llm_router=llm_router)
db_writer_client = HTTPHandler() db_writer_client = HTTPHandler()
proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made

View file

@ -3876,13 +3876,13 @@ class Router:
_api_base = litellm.get_api_base( _api_base = litellm.get_api_base(
model=_model_name, optional_params=temp_litellm_params model=_model_name, optional_params=temp_litellm_params
) )
asyncio.create_task( # asyncio.create_task(
proxy_logging_obj.slack_alerting_instance.send_alert( # proxy_logging_obj.slack_alerting_instance.send_alert(
message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns", # message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns",
alert_type="cooldown_deployment", # alert_type="cooldown_deployment",
level="Low", # level="Low",
) # )
) # )
except Exception as e: except Exception as e:
pass pass

View file

@ -572,6 +572,8 @@ async def test_outage_alerting_called(
num_retries=0, num_retries=0,
allowed_fails=100, allowed_fails=100,
) )
slack_alerting.update_values(llm_router=router)
with patch.object( with patch.object(
slack_alerting, "outage_alerts", new=AsyncMock() slack_alerting, "outage_alerts", new=AsyncMock()
) as mock_send_alert: ) as mock_send_alert:

View file

@ -6286,7 +6286,9 @@ def get_model_region(
return None return None
def get_api_base(model: str, optional_params: dict) -> Optional[str]: def get_api_base(
model: str, optional_params: Union[dict, LiteLLM_Params]
) -> Optional[str]:
""" """
Returns the api base used for calling the model. Returns the api base used for calling the model.
@ -6306,7 +6308,9 @@ def get_api_base(model: str, optional_params: dict) -> Optional[str]:
""" """
try: try:
if "model" in optional_params: if isinstance(optional_params, LiteLLM_Params):
_optional_params = optional_params
elif "model" in optional_params:
_optional_params = LiteLLM_Params(**optional_params) _optional_params = LiteLLM_Params(**optional_params)
else: # prevent needing to copy and pop the dict else: # prevent needing to copy and pop the dict
_optional_params = LiteLLM_Params( _optional_params = LiteLLM_Params(
@ -6699,6 +6703,8 @@ def get_llm_provider(
Returns the provider for a given model name - e.g. 'azure/chatgpt-v-2' -> 'azure' Returns the provider for a given model name - e.g. 'azure/chatgpt-v-2' -> 'azure'
For router -> Can also give the whole litellm param dict -> this function will extract the relevant details For router -> Can also give the whole litellm param dict -> this function will extract the relevant details
Raises Error - if unable to map model to a provider
""" """
try: try:
## IF LITELLM PARAMS GIVEN ## ## IF LITELLM PARAMS GIVEN ##