diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py
index a4ff0620ad..0cea0fee2b 100644
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@@ -1,6 +1,6 @@
 #### What this does ####
 #    Class for sending Slack Alerts #
-import dotenv, os
+import dotenv, os, traceback
 from litellm.proxy._types import UserAPIKeyAuth, CallInfo
 from litellm._logging import verbose_logger, verbose_proxy_logger
 import litellm, threading
@@ -15,6 +15,35 @@ from enum import Enum
 from datetime import datetime as dt, timedelta, timezone
 from litellm.integrations.custom_logger import CustomLogger
 import random
+from typing import TypedDict
+from openai import APIError
+
+import litellm.types
+import litellm.types.router
+
+
+class OutageModel(TypedDict):
+    provider: str
+    region_name: str
+    alerts: List[str]
+    deployment_ids: List[str]
+    minor_alert_sent: bool
+    major_alert_sent: bool
+    last_updated_at: float
+
+
+AlertType = Literal[
+    "llm_exceptions",
+    "llm_too_slow",
+    "llm_requests_hanging",
+    "budget_alerts",
+    "db_exceptions",
+    "daily_reports",
+    "spend_reports",
+    "cooldown_deployment",
+    "new_model_added",
+    "outage_alerts",
+]
 
 
 class LiteLLMBase(BaseModel):
@@ -37,6 +66,10 @@ class SlackAlertingArgs(LiteLLMBase):
     )
     report_check_interval: int = 5 * 60  # 5 minutes
     budget_alert_ttl: int = 24 * 60 * 60  # 24 hours
+    outage_alert_ttl: int = 1 * 60 * 60  # 1 hour
+    minor_outage_alert_threshold: int = 3
+    major_outage_alert_threshold: int = 10
+    max_outage_alert_list_size: int = 10  # prevent memory leak
 
 
 class WebhookEvent(CallInfo):
@@ -86,19 +119,7 @@ class SlackAlerting(CustomLogger):
         internal_usage_cache: Optional[DualCache] = None,
         alerting_threshold: float = 300,  # threshold for slow / hanging llm responses (in seconds)
         alerting: Optional[List] = [],
-        alert_types: List[
-            Literal[
-                "llm_exceptions",
-                "llm_too_slow",
-                "llm_requests_hanging",
-                "budget_alerts",
-                "db_exceptions",
-                "daily_reports",
-                "spend_reports",
-                "cooldown_deployment",
-                "new_model_added",
-            ]
-        ] = [
+        alert_types: List[AlertType] = [
             "llm_exceptions",
             "llm_too_slow",
             "llm_requests_hanging",
@@ -108,6 +129,7 @@ class SlackAlerting(CustomLogger):
             "spend_reports",
             "cooldown_deployment",
             "new_model_added",
+            "outage_alerts",
         ],
         alert_to_webhook_url: Optional[
             Dict
@@ -696,6 +718,99 @@ class SlackAlerting(CustomLogger):
                 return
         return
 
+    async def outage_alerts(
+        self,
+        provider: str,
+        region_name: str,
+        exception: APIError,
+        deployment_id: str,
+    ) -> None:
+        """
+        Send a slack alert if a provider region (e.g. azure east-us-1) is having an outage (408 or 5xx errors).
+
+        key = (provider + region)
+
+        value = {
+        - provider
+        - region
+        - threshold
+        - alerts []
+        }
+
+        ttl = 1hr
+        max_alerts_size = 10
+        """
+        _id = provider + region_name
+
+        outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=_id)  # type: ignore
+
+        if (
+            getattr(exception, "status_code", None) is not None
+            and exception.status_code != 408  # type: ignore
+            and exception.status_code < 500  # type: ignore
+        ):
+            return
+
+        if outage_value is None:
+            outage_value = OutageModel(
+                provider=provider,
+                region_name=region_name,
+                alerts=[exception.message],
+                deployment_ids=[deployment_id],
+                minor_alert_sent=False,
+                major_alert_sent=False,
+                last_updated_at=time.time(),
+            )
+
+            ## add to cache ##
+            await self.internal_usage_cache.async_set_cache(
+                key=_id, value=outage_value, ttl=self.alerting_args.outage_alert_ttl
+            )
+            return
+
+        outage_value["alerts"].append(exception.message)
+        outage_value["deployment_ids"].append(deployment_id)
+        outage_value["last_updated_at"] = time.time()
+
+        ## MINOR OUTAGE ALERT ##
+        if (
+            outage_value["minor_alert_sent"] == False
+            and len(outage_value["alerts"])
+            > self.alerting_args.minor_outage_alert_threshold
+        ):
+            msg = "{} {} is having a **Minor Service Outage**.\n\n**Errors**\n{}\n\nLast Check: {}".format(
+                provider,
+                region_name,
+                outage_value["alerts"],
+                outage_value["last_updated_at"],
+            )
+            # send minor alert
+            _result_val = self.send_alert(
+                message=msg, level="Medium", alert_type="outage_alerts"
+            )
+            if _result_val is not None:
+                await _result_val
+            # set to true
+            outage_value["minor_alert_sent"] = True
+        elif (
+            outage_value["major_alert_sent"] == False
+            and len(outage_value["alerts"])
+            > self.alerting_args.major_outage_alert_threshold
+        ):
+            msg = "{} {} is having a **Major Service Outage**.\n\n**Errors**\n{}\n\nLast Check: {}".format(
+                provider,
+                region_name,
+                outage_value["alerts"],
+                outage_value["last_updated_at"],
+            )
+            # send major alert
+            await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
+            # set to true
+            outage_value["major_alert_sent"] = True
+
+        ## update cache ##
+        await self.internal_usage_cache.async_set_cache(key=_id, value=outage_value)
+
     async def model_added_alert(
         self, model_name: str, litellm_model_name: str, passed_model_info: Any
     ):
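The bookkeeping above is the heart of the feature: one cache entry per `provider + region_name` key, with the `alerts` list acting as a failure counter and the two booleans preventing duplicate pages. Below is a minimal standalone sketch of that threshold logic, with a plain dict standing in for `DualCache`; `record_failure`, `_cache`, and the module-level constants are illustrative names, not part of this diff.

```python
# Sketch of the minor/major threshold bookkeeping used by outage_alerts().
import time
from typing import List, Optional, TypedDict


class OutageModel(TypedDict):
    provider: str
    region_name: str
    alerts: List[str]
    deployment_ids: List[str]
    minor_alert_sent: bool
    major_alert_sent: bool
    last_updated_at: float


MINOR_THRESHOLD = 3   # mirrors SlackAlertingArgs.minor_outage_alert_threshold
MAJOR_THRESHOLD = 10  # mirrors SlackAlertingArgs.major_outage_alert_threshold

_cache: dict = {}  # stand-in for self.internal_usage_cache (DualCache)


def record_failure(provider: str, region: str, error: str, deployment_id: str) -> Optional[str]:
    """Return "minor" or "major" when an alert would fire, else None."""
    _id = provider + region
    value: Optional[OutageModel] = _cache.get(_id)
    if value is None:
        # first failure seeds the cache entry; no alert yet
        _cache[_id] = OutageModel(
            provider=provider, region_name=region, alerts=[error],
            deployment_ids=[deployment_id], minor_alert_sent=False,
            major_alert_sent=False, last_updated_at=time.time(),
        )
        return None
    value["alerts"].append(error)
    value["deployment_ids"].append(deployment_id)
    value["last_updated_at"] = time.time()
    if not value["minor_alert_sent"] and len(value["alerts"]) > MINOR_THRESHOLD:
        value["minor_alert_sent"] = True  # send once, then only escalate
        return "minor"
    if not value["major_alert_sent"] and len(value["alerts"]) > MAJOR_THRESHOLD:
        value["major_alert_sent"] = True
        return "major"
    return None


levels = [record_failure("azure", "eastus", f"503 #{i}", "dep-1") for i in range(5)]
print(levels)  # [None, None, None, 'minor', None] -- the 4th failure crosses > 3
```

Note the strict `>` comparison: with the default threshold of 3, the minor alert fires on the failure that takes the list past three entries, i.e. the fourth consecutive failure within the TTL window.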
@@ -745,10 +860,12 @@ Model Info:
 ```
 """
 
-        await self.send_alert(
+        alert_val = self.send_alert(
             message=message, level="Low", alert_type="new_model_added"
         )
-        pass
+
+        if alert_val is not None and asyncio.iscoroutine(alert_val):
+            await alert_val
 
     async def model_removed_alert(self, model_name: str):
         pass
@@ -795,6 +912,7 @@ Model Info:
             "spend_reports",
             "new_model_added",
             "cooldown_deployment",
+            "outage_alerts",
         ],
         user_info: Optional[WebhookEvent] = None,
         **kwargs,
@@ -910,18 +1028,55 @@ Model Info:
 
     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
         """Log failure + deployment latency"""
-        if "daily_reports" in self.alert_types:
-            model_id = (
-                kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
-            )
-            await self.async_update_daily_reports(
-                DeploymentMetrics(
-                    id=model_id,
-                    failed_request=True,
-                    latency_per_output_token=None,
-                    updated_at=litellm.utils.get_utc_datetime(),
+        _litellm_params = kwargs.get("litellm_params", {})
+        _model_info = _litellm_params.get("model_info", {}) or {}
+        model_id = _model_info.get("id", "")
+        try:
+            if "daily_reports" in self.alert_types:
+                try:
+                    await self.async_update_daily_reports(
+                        DeploymentMetrics(
+                            id=model_id,
+                            failed_request=True,
+                            latency_per_output_token=None,
+                            updated_at=litellm.utils.get_utc_datetime(),
+                        )
+                    )
+                except Exception as e:
+                    verbose_logger.debug(f"Exception raised - {str(e)}")
+
+            if "outage_alerts" in self.alert_types and isinstance(
+                kwargs.get("exception", ""), APIError
+            ):
+                _litellm_params = litellm.types.router.LiteLLM_Params(
+                    model=kwargs.get("model", ""),
+                    **kwargs.get("litellm_params", {}),
+                    **kwargs.get("optional_params", {}),
                 )
-            )
+                _region_name = litellm.utils._get_model_region(
+                    custom_llm_provider=kwargs.get("custom_llm_provider", ""),
+                    litellm_params=_litellm_params,
+                )
+                # if region name not known, default to api base #
+                if _region_name is None:
+                    _region_name = litellm.get_api_base(
+                        model=kwargs.get("model", ""),
+                        optional_params={
+                            **kwargs.get("litellm_params", {}),
+                            **kwargs.get("optional_params", {}),
+                        },
+                    )
+                    if _region_name is None:
+                        _region_name = ""
+
+                await self.outage_alerts(
+                    provider=kwargs.get("custom_llm_provider", "") or "",
+                    region_name=_region_name,
+                    exception=kwargs["exception"],
+                    deployment_id=model_id,
+                )
+        except Exception as e:
+            verbose_logger.debug(f"Exception raised in outage alerting - {str(e)}")
 
     async def _run_scheduler_helper(self, llm_router) -> bool:
         """
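One consequence of the fallback in the hook above: when a deployment has no region metadata, outages are grouped by `provider + api_base` instead, and if neither is known, all failures for that provider collapse into a single bucket. A sketch of just that resolution order; `resolve_region` is a hypothetical helper for illustration, the diff inlines this logic in the hook.

```python
# Illustrative fallback order for the outage grouping key (hypothetical helper).
from typing import Optional


def resolve_region(region_name: Optional[str], api_base: Optional[str]) -> str:
    if region_name is not None:
        return region_name  # e.g. "eastus" for an Azure deployment
    if api_base is not None:
        return api_base     # e.g. "https://my-endpoint.openai.azure.com"
    return ""               # unknown: alerts are keyed on the provider alone


assert resolve_region("eastus", None) == "eastus"
assert resolve_region(None, "https://my-endpoint.openai.azure.com") == (
    "https://my-endpoint.openai.azure.com"
)
assert resolve_region(None, None) == ""
```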
diff --git a/litellm/main.py b/litellm/main.py
index dc4cf0001e..9ff474af1f 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -420,6 +420,8 @@ def mock_completion(
             api_key="mock-key",
         )
         if isinstance(mock_response, Exception):
+            if isinstance(mock_response, openai.APIError):
+                raise mock_response
             raise litellm.APIError(
                 status_code=500,  # type: ignore
                 message=str(mock_response),
@@ -463,7 +465,9 @@ def mock_completion(
 
         return model_response
 
-    except:
+    except Exception as e:
+        if isinstance(e, openai.APIError):
+            raise e
         traceback.print_exc()
         raise Exception("Mock completion response failed")
 
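This `mock_completion` change is what lets the tests below simulate specific provider failures: an `openai.APIError` subclass passed as `mock_response` is now re-raised as-is instead of being wrapped in a generic 500 `litellm.APIError`. Roughly how that looks from the caller's side (a sketch, assuming a standard litellm install at this commit):

```python
import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
        # litellm.Timeout subclasses openai.APIError, so it now surfaces unchanged
        mock_response=litellm.Timeout(
            message="A timeout occurred",
            model="gpt-3.5-turbo",
            llm_provider="openai",
        ),
    )
except litellm.Timeout as e:
    # callbacks and exception mapping see the real 408-style error,
    # not a blanket 500
    print("caught:", type(e).__name__)
```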
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index b22134dab3..33e2114c2d 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -42,7 +42,7 @@ import smtplib, re
 from email.mime.text import MIMEText
 from email.mime.multipart import MIMEMultipart
 from datetime import datetime, timedelta
-from litellm.integrations.slack_alerting import SlackAlerting
+from litellm.integrations.slack_alerting import SlackAlerting, AlertType
 from typing_extensions import overload
 
 
@@ -78,19 +78,7 @@ class ProxyLogging:
         self.cache_control_check = _PROXY_CacheControlCheck()
         self.alerting: Optional[List] = None
         self.alerting_threshold: float = 300  # default to 5 min. threshold
-        self.alert_types: List[
-            Literal[
-                "llm_exceptions",
-                "llm_too_slow",
-                "llm_requests_hanging",
-                "budget_alerts",
-                "db_exceptions",
-                "daily_reports",
-                "spend_reports",
-                "cooldown_deployment",
-                "new_model_added",
-            ]
-        ] = [
+        self.alert_types: List[AlertType] = [
             "llm_exceptions",
             "llm_too_slow",
             "llm_requests_hanging",
@@ -100,6 +88,7 @@ class ProxyLogging:
             "spend_reports",
             "cooldown_deployment",
             "new_model_added",
+            "outage_alerts",
         ]
         self.slack_alerting_instance = SlackAlerting(
             alerting_threshold=self.alerting_threshold,
@@ -113,21 +102,7 @@
         alerting: Optional[List],
         alerting_threshold: Optional[float],
         redis_cache: Optional[RedisCache],
-        alert_types: Optional[
-            List[
-                Literal[
-                    "llm_exceptions",
-                    "llm_too_slow",
-                    "llm_requests_hanging",
-                    "budget_alerts",
-                    "db_exceptions",
-                    "daily_reports",
-                    "spend_reports",
-                    "cooldown_deployment",
-                    "new_model_added",
-                ]
-            ]
-        ] = None,
+        alert_types: Optional[List[AlertType]] = None,
         alerting_args: Optional[dict] = None,
     ):
         self.alerting = alerting
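With the `Literal` union now defined once as `AlertType` in `slack_alerting.py`, both the proxy and direct `SlackAlerting` users can pass a type-checked subset instead of re-declaring the union at every call site. A usage sketch under that assumption:

```python
from typing import List

from litellm.integrations.slack_alerting import AlertType, SlackAlerting

# only page on outages and hanging requests; a typo like "outage_alert"
# is now a type-checker error instead of a silently ignored string
alert_types: List[AlertType] = ["outage_alerts", "llm_requests_hanging"]

slack_alerting = SlackAlerting(
    alerting=["slack"],
    alert_types=alert_types,
)
```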
diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py
index 2f8d7f3efd..7a4214bbd8 100644
--- a/litellm/tests/test_alerting.py
+++ b/litellm/tests/test_alerting.py
@@ -1,10 +1,11 @@
 # What is this?
 ## Tests slack alerting on proxy logging object
 
-import sys, json, uuid, random
+import sys, json, uuid, random, httpx
 import os
 import io, asyncio
 from datetime import datetime, timedelta
+from typing import Optional
 
 # import logging
 # logging.basicConfig(level=logging.DEBUG)
@@ -23,6 +24,7 @@ from unittest.mock import AsyncMock
 import pytest
 from litellm.router import AlertingConfig, Router
 from litellm.proxy._types import CallInfo
+from openai import APIError
 
 
 @pytest.mark.parametrize(
@@ -495,3 +497,96 @@ async def test_webhook_alerting(alerting_type):
             user_info=user_info,
         )
         mock_send_alert.assert_awaited_once()
+
+
+@pytest.mark.parametrize(
+    "model, api_base, llm_provider, vertex_project, vertex_location",
+    [
+        ("gpt-3.5-turbo", None, "openai", None, None),
+        (
+            "azure/gpt-3.5-turbo",
+            "https://openai-gpt-4-test-v-1.openai.azure.com",
+            "azure",
+            None,
+            None,
+        ),
+        ("gemini-pro", None, "vertex_ai", "hardy-device-38811", "us-central1"),
+    ],
+)
+@pytest.mark.parametrize("error_code", [500, 408, 400])
+@pytest.mark.asyncio
+async def test_outage_alerting_called(
+    model, api_base, llm_provider, vertex_project, vertex_location, error_code
+):
+    """
+    If a call fails, the outage alerting hook is called.
+
+    If multiple calls fail with an outage-level error, an outage alert is sent.
+    """
+    slack_alerting = SlackAlerting(alerting=["webhook"])
+
+    litellm.callbacks = [slack_alerting]
+
+    error_to_raise: Optional[APIError] = None
+
+    if error_code == 400:
+        print("RAISING 400 ERROR CODE")
+        error_to_raise = litellm.BadRequestError(
+            message="this is a bad request",
+            model=model,
+            llm_provider=llm_provider,
+        )
+    elif error_code == 408:
+        print("RAISING 408 ERROR CODE")
+        error_to_raise = litellm.Timeout(
+            message="A timeout occurred", model=model, llm_provider=llm_provider
+        )
+    elif error_code == 500:
+        print("RAISING 500 ERROR CODE")
+        error_to_raise = litellm.ServiceUnavailableError(
+            message="API is unavailable",
+            model=model,
+            llm_provider=llm_provider,
+            response=httpx.Response(
+                status_code=503,
+                request=httpx.Request(
+                    method="POST",
+                    url="https://github.com/BerriAI/litellm",
+                ),
+            ),
+        )
+    with patch.object(
+        slack_alerting, "outage_alerts", new=AsyncMock()
+    ) as mock_outage_alert:
+        try:
+            await litellm.acompletion(
+                model=model,
+                messages=[{"role": "user", "content": "Hey!"}],
+                api_base=api_base,
+                vertex_location=vertex_location,
+                vertex_project=vertex_project,
+                mock_response=error_to_raise,
+            )
+        except Exception as e:
+            pass
+
+        mock_outage_alert.assert_called_once()
+
+    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
+        for _ in range(3):
+            try:
+                await litellm.acompletion(
+                    model=model,
+                    messages=[{"role": "user", "content": "Hey!"}],
+                    api_base=api_base,
+                    vertex_location=vertex_location,
+                    vertex_project=vertex_project,
+                    mock_response=error_to_raise,
+                )
+            except Exception as e:
+                pass
+
+        if error_code == 500 or error_code == 408:
+            mock_send_alert.assert_called_once()
+        else:
+            mock_send_alert.assert_not_called()
diff --git a/litellm/utils.py b/litellm/utils.py
index 0f2a46f68d..1ed70a9428 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -8632,7 +8632,16 @@
                     )
                 elif hasattr(original_exception, "status_code"):
                     exception_mapping_worked = True
-                    if original_exception.status_code == 401:
+                    if original_exception.status_code == 400:
+                        exception_mapping_worked = True
+                        raise BadRequestError(
+                            message=f"{exception_provider} - {message}",
+                            llm_provider=custom_llm_provider,
+                            model=model,
+                            response=original_exception.response,
+                            litellm_debug_info=extra_information,
+                        )
+                    elif original_exception.status_code == 401:
                         exception_mapping_worked = True
                         raise AuthenticationError(
                             message=f"{exception_provider} - {message}",
@@ -9145,6 +9154,7 @@
                         ),
                     ),
                 )
+
             if hasattr(original_exception, "status_code"):
                 if original_exception.status_code == 400:
                     exception_mapping_worked = True
@@ -9825,7 +9835,16 @@
                     )
                 elif hasattr(original_exception, "status_code"):
                     exception_mapping_worked = True
-                    if original_exception.status_code == 401:
+                    if original_exception.status_code == 400:
+                        exception_mapping_worked = True
+                        raise BadRequestError(
+                            message=f"AzureException - {original_exception.message}",
+                            llm_provider="azure",
+                            model=model,
+                            litellm_debug_info=extra_information,
+                            response=original_exception.response,
+                        )
+                    elif original_exception.status_code == 401:
                         exception_mapping_worked = True
                         raise AuthenticationError(
                             message=f"AzureException - {original_exception.message}",
@@ -9842,7 +9861,7 @@
                             litellm_debug_info=extra_information,
                             llm_provider="azure",
                         )
-                    if original_exception.status_code == 422:
+                    elif original_exception.status_code == 422:
                         exception_mapping_worked = True
                         raise BadRequestError(
                             message=f"AzureException - {original_exception.message}",
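The `litellm/utils.py` hunks close the loop on the 400-error test case above: a provider-side 400 now maps to `litellm.BadRequestError` (and the stray `if` at 422 becomes an `elif`), so client errors reach `async_log_failure_event` with their true status code and `outage_alerts` can ignore them. A quick sketch of the observable behavior (assumes a standard litellm install at this commit):

```python
import litellm

try:
    litellm.completion(
        model="azure/gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
        mock_response=litellm.BadRequestError(
            message="this is a bad request",
            model="azure/gpt-3.5-turbo",
            llm_provider="azure",
        ),
    )
except litellm.BadRequestError:
    # a 400 is a client error: outage_alerts() returns early
    # (status_code != 408 and < 500), so no Slack alert fires
    print("mapped to BadRequestError")
```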