diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index 620320f674..9e35b4fc3d 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -4,13 +4,13 @@ import dotenv, os, traceback from litellm.proxy._types import UserAPIKeyAuth, CallInfo, AlertType from litellm._logging import verbose_logger, verbose_proxy_logger import litellm, threading -from typing import List, Literal, Any, Union, Optional, Dict +from typing import List, Literal, Any, Union, Optional, Dict, Set from litellm.caching import DualCache import asyncio, time import aiohttp from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler import datetime -from pydantic import BaseModel +from pydantic import BaseModel, Field from enum import Enum from datetime import datetime as dt, timedelta, timezone from litellm.integrations.custom_logger import CustomLogger @@ -20,17 +20,25 @@ from typing import TypedDict from openai import APIError import litellm.types -import litellm.types.router +from litellm.types.router import LiteLLM_Params -class OutageModel(TypedDict): - model_id: str +class BaseOutageModel(TypedDict): alerts: List[int] - deployment_ids: List[str] minor_alert_sent: bool major_alert_sent: bool last_updated_at: float + +class OutageModel(BaseOutageModel): + model_id: str + + +class ProviderRegionOutageModel(BaseOutageModel): + provider_region_id: str + deployment_ids: Set[str] + + # we use this for the email header, please send a test email if you change this. verify it looks good on email LITELLM_LOGO_URL = "https://litellm-listing.s3.amazonaws.com/litellm_logo.png" EMAIL_LOGO_URL = os.getenv( @@ -52,17 +60,55 @@ class LiteLLMBase(BaseModel): return self.dict() +class SlackAlertingArgsEnum(Enum): + daily_report_frequency: int = 12 * 60 * 60 + report_check_interval: int = 5 * 60 + budget_alert_ttl: int = 24 * 60 * 60 + outage_alert_ttl: int = 1 * 60 + region_outage_alert_ttl: int = 1 * 60 + minor_outage_alert_threshold: int = 1 * 5 + major_outage_alert_threshold: int = 1 * 10 + max_outage_alert_list_size: int = 1 * 10 + + class SlackAlertingArgs(LiteLLMBase): - default_daily_report_frequency: int = 12 * 60 * 60 # 12 hours - daily_report_frequency: int = int( - os.getenv("SLACK_DAILY_REPORT_FREQUENCY", default_daily_report_frequency) + daily_report_frequency: int = Field( + default=int( + os.getenv( + "SLACK_DAILY_REPORT_FREQUENCY", + SlackAlertingArgsEnum.daily_report_frequency.value, + ) + ), + description="Frequency of receiving deployment latency/failure reports. Default is 12hours. Value is in seconds.", ) - report_check_interval: int = 5 * 60 # 5 minutes - budget_alert_ttl: int = 24 * 60 * 60 # 24 hours - outage_alert_ttl: int = 1 * 60 # 1 minute ttl - minor_outage_alert_threshold: int = 5 - major_outage_alert_threshold: int = 10 - max_outage_alert_list_size: int = 10 # prevent memory leak + report_check_interval: int = Field( + default=SlackAlertingArgsEnum.report_check_interval.value, + description="Frequency of checking cache if report should be sent. Background process. Default is once per hour. Value is in seconds.", + ) # 5 minutes + budget_alert_ttl: int = Field( + default=SlackAlertingArgsEnum.budget_alert_ttl.value, + description="Cache ttl for budgets alerts. Prevents spamming same alert, each time budget is crossed. Value is in seconds.", + ) # 24 hours + outage_alert_ttl: int = Field( + default=SlackAlertingArgsEnum.outage_alert_ttl.value, + description="Cache ttl for model outage alerts. 
Sets time-window for errors. Default is 1 minute. Value is in seconds.", + ) # 1 minute ttl + region_outage_alert_ttl: int = Field( + default=SlackAlertingArgsEnum.region_outage_alert_ttl.value, + description="Cache ttl for provider-region based outage alerts. Alert sent if 2+ models in same region report errors. Sets time-window for errors. Default is 1 minute. Value is in seconds.", + ) # 1 minute ttl + minor_outage_alert_threshold: int = Field( + default=SlackAlertingArgsEnum.minor_outage_alert_threshold.value, + description="The number of errors that count as a model/region minor outage. ('400' error code is not counted).", + ) + major_outage_alert_threshold: int = Field( + default=SlackAlertingArgsEnum.major_outage_alert_threshold.value, + description="The number of errors that countas a model/region major outage. ('400' error code is not counted).", + ) + max_outage_alert_list_size: int = Field( + default=SlackAlertingArgsEnum.max_outage_alert_list_size.value, + description="Maximum number of errors to store in cache. For a given model/region. Prevents memory leaks.", + ) # prevent memory leak class DeploymentMetrics(LiteLLMBase): @@ -736,6 +782,163 @@ class SlackAlerting(CustomLogger): return error_msg + def _outage_alert_msg_factory( + self, + alert_type: Literal["Major", "Minor"], + key: Literal["Model", "Region"], + key_val: str, + provider: str, + api_base: Optional[str], + outage_value: BaseOutageModel, + ) -> str: + """Format an alert message for slack""" + headers = {f"{key} Name": key_val, "Provider": provider} + if api_base is not None: + headers["API Base"] = api_base # type: ignore + + headers_str = "\n" + for k, v in headers.items(): + headers_str += f"*{k}:* `{v}`\n" + return f"""\n\n +*⚠️ {alert_type} Service Outage* + +{headers_str} + +*Errors:* +{self._count_outage_alerts(alerts=outage_value["alerts"])} + +*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n +""" + + async def region_outage_alerts( + self, + exception: APIError, + deployment_id: str, + ) -> None: + """ + Send slack alert if specific provider region is having an outage. 
+ + Track for 408 (Timeout) and >=500 Error codes + """ + ## CREATE (PROVIDER+REGION) ID ## + if self.llm_router is None: + return + + deployment = self.llm_router.get_deployment(model_id=deployment_id) + + if deployment is None: + return + + model = deployment.litellm_params.model + ### GET PROVIDER ### + provider = deployment.litellm_params.custom_llm_provider + if provider is None: + model, provider, _, _ = litellm.get_llm_provider(model=model) + + ### GET REGION ### + region_name = deployment.litellm_params.region_name + if region_name is None: + region_name = litellm.utils._get_model_region( + custom_llm_provider=provider, litellm_params=deployment.litellm_params + ) + + if region_name is None: + return + + ### UNIQUE CACHE KEY ### + cache_key = provider + region_name + + outage_value: Optional[ProviderRegionOutageModel] = ( + await self.internal_usage_cache.async_get_cache(key=cache_key) + ) + + if ( + getattr(exception, "status_code", None) is None + or ( + exception.status_code != 408 # type: ignore + and exception.status_code < 500 # type: ignore + ) + or self.llm_router is None + ): + return + + if outage_value is None: + _deployment_set = set() + _deployment_set.add(deployment_id) + outage_value = ProviderRegionOutageModel( + provider_region_id=cache_key, + alerts=[exception.status_code], # type: ignore + minor_alert_sent=False, + major_alert_sent=False, + last_updated_at=time.time(), + deployment_ids=_deployment_set, + ) + + ## add to cache ## + await self.internal_usage_cache.async_set_cache( + key=cache_key, + value=outage_value, + ttl=self.alerting_args.region_outage_alert_ttl, + ) + return + + if len(outage_value["alerts"]) < self.alerting_args.max_outage_alert_list_size: + outage_value["alerts"].append(exception.status_code) # type: ignore + else: # prevent memory leaks + pass + _deployment_set = outage_value["deployment_ids"] + _deployment_set.add(deployment_id) + outage_value["deployment_ids"] = _deployment_set + outage_value["last_updated_at"] = time.time() + + ## MINOR OUTAGE ALERT SENT ## + if ( + outage_value["minor_alert_sent"] == False + and len(outage_value["alerts"]) + >= self.alerting_args.minor_outage_alert_threshold + and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment + ): + msg = self._outage_alert_msg_factory( + alert_type="Minor", + key="Region", + key_val=region_name, + api_base=None, + outage_value=outage_value, + provider=provider, + ) + # send minor alert + await self.send_alert( + message=msg, level="Medium", alert_type="outage_alerts" + ) + # set to true + outage_value["minor_alert_sent"] = True + + ## MAJOR OUTAGE ALERT SENT ## + elif ( + outage_value["major_alert_sent"] == False + and len(outage_value["alerts"]) + >= self.alerting_args.major_outage_alert_threshold + and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment + ): + msg = self._outage_alert_msg_factory( + alert_type="Major", + key="Region", + key_val=region_name, + api_base=None, + outage_value=outage_value, + provider=provider, + ) + + # send minor alert + await self.send_alert(message=msg, level="High", alert_type="outage_alerts") + # set to true + outage_value["major_alert_sent"] = True + + ## update cache ## + await self.internal_usage_cache.async_set_cache( + key=cache_key, value=outage_value + ) + async def outage_alerts( self, exception: APIError, @@ -787,7 +990,6 @@ class SlackAlerting(CustomLogger): outage_value = OutageModel( model_id=deployment_id, alerts=[exception.status_code], # type: ignore - deployment_ids=[deployment_id], 
minor_alert_sent=False, major_alert_sent=False, last_updated_at=time.time(), @@ -801,8 +1003,14 @@ class SlackAlerting(CustomLogger): ) return - outage_value["alerts"].append(exception.status_code) # type: ignore - outage_value["deployment_ids"].append(deployment_id) + if ( + len(outage_value["alerts"]) + < self.alerting_args.max_outage_alert_list_size + ): + outage_value["alerts"].append(exception.status_code) # type: ignore + else: # prevent memory leaks + pass + outage_value["last_updated_at"] = time.time() ## MINOR OUTAGE ALERT SENT ## @@ -811,25 +1019,18 @@ class SlackAlerting(CustomLogger): and len(outage_value["alerts"]) >= self.alerting_args.minor_outage_alert_threshold ): - msg = f"""\n\n -*⚠️ Minor Service Outage* - -*Model Name:* `{model}` -*Provider:* `{provider}` -*API Base:* `{api_base}` - -*Errors:* -{self._count_outage_alerts(alerts=outage_value["alerts"])} - - -*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n -""" + msg = self._outage_alert_msg_factory( + alert_type="Minor", + key="Model", + key_val=model, + api_base=api_base, + outage_value=outage_value, + provider=provider, + ) # send minor alert - _result_val = self.send_alert( + await self.send_alert( message=msg, level="Medium", alert_type="outage_alerts" ) - if _result_val is not None: - await _result_val # set to true outage_value["minor_alert_sent"] = True elif ( @@ -837,19 +1038,14 @@ class SlackAlerting(CustomLogger): and len(outage_value["alerts"]) >= self.alerting_args.major_outage_alert_threshold ): - msg = f"""\n\n -*⚠️ Major Service Outage* - -*Model Name:* `{model}` -*Provider:* `{provider}` -*API Base:* `{api_base}` - -*Errors:* -{self._count_outage_alerts(alerts=outage_value["alerts"])} - - -*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n -""" + msg = self._outage_alert_msg_factory( + alert_type="Major", + key="Model", + key_val=model, + api_base=api_base, + outage_value=outage_value, + provider=provider, + ) # send minor alert await self.send_alert( message=msg, level="High", alert_type="outage_alerts" @@ -1103,18 +1299,7 @@ Model Info: self, message: str, level: Literal["Low", "Medium", "High"], - alert_type: Literal[ - "llm_exceptions", - "llm_too_slow", - "llm_requests_hanging", - "budget_alerts", - "db_exceptions", - "daily_reports", - "spend_reports", - "new_model_added", - "cooldown_deployment", - "outage_alerts", - ], + alert_type: Literal[AlertType], user_info: Optional[WebhookEvent] = None, **kwargs, ): @@ -1254,34 +1439,17 @@ Model Info: except Exception as e: verbose_logger.debug(f"Exception raises -{str(e)}") - if "outage_alerts" in self.alert_types and isinstance( - kwargs.get("exception", ""), APIError - ): - _litellm_params = litellm.types.router.LiteLLM_Params( - model=kwargs.get("model", ""), - **kwargs.get("litellm_params", {}), - **kwargs.get("optional_params", {}), - ) - _region_name = litellm.utils._get_model_region( - custom_llm_provider=kwargs.get("custom_llm_provider", ""), - litellm_params=_litellm_params, - ) - # if region name not known, default to api base # - if _region_name is None: - _region_name = litellm.get_api_base( - model=kwargs.get("model", ""), - optional_params={ - **kwargs.get("litellm_params", {}), - **kwargs.get("optional_params", {}), - }, + if isinstance(kwargs.get("exception", ""), APIError): + if "outage_alerts" in self.alert_types: + await self.outage_alerts( + exception=kwargs["exception"], + deployment_id=model_id, ) - if _region_name is None: - _region_name = "" - await 
self.outage_alerts( - exception=kwargs["exception"], - deployment_id=model_id, - ) + if "region_outage_alerts" in self.alert_types: + await self.region_outage_alerts( + exception=kwargs["exception"], deployment_id=model_id + ) except Exception as e: pass diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index d5b9322462..002df7accf 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -18,6 +18,7 @@ AlertType = Literal[ "cooldown_deployment", "new_model_added", "outage_alerts", + "region_outage_alerts", ] @@ -835,6 +836,7 @@ class ConfigList(LiteLLMBase): field_description: str field_value: Any stored_in_db: Optional[bool] + field_default_value: Any class ConfigGeneralSettings(LiteLLMBase): @@ -912,7 +914,9 @@ class ConfigGeneralSettings(LiteLLMBase): None, description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`", ) - + alerting_args: Optional[Dict] = Field( + None, description="Controllable params for slack alerting - e.g. ttl in cache." + ) alerting_threshold: Optional[int] = Field( None, description="sends alerts if requests hang for 5min+", diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index ed7e999744..456a30aaa5 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -130,6 +130,7 @@ from litellm.proxy.auth.auth_checks import ( ) from litellm.llms.custom_httpx.httpx_handler import HTTPHandler from litellm.exceptions import RejectedRequestError +from litellm.integrations.slack_alerting import SlackAlertingArgs, SlackAlerting try: from litellm._version import version @@ -3050,6 +3051,13 @@ class ProxyConfig: "global_max_parallel_requests" ] + ## ALERTING ARGS ## + if "alerting_args" in _general_settings: + general_settings["alerting_args"] = _general_settings["alerting_args"] + proxy_logging_obj.slack_alerting_instance.update_values( + alerting_args=general_settings["alerting_args"], + ) + async def add_deployment( self, prisma_client: PrismaClient, @@ -8910,6 +8918,7 @@ async def budget_settings( field_description=field_info.description or "", field_value=db_budget_row_dict.get(field_name, None), stored_in_db=_stored_in_db, + field_default_value=field_info.default, ) return_val.append(_response_obj) @@ -9826,6 +9835,149 @@ async def model_settings(): return returned_list +#### ALERTING MANAGEMENT ENDPOINTS #### + + +@router.get( + "/alerting/settings", + description="Return the configurable alerting param, description, and current value", + tags=["alerting"], + dependencies=[Depends(user_api_key_auth)], + include_in_schema=False, +) +async def alerting_settings( + user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +): + global proxy_logging_obj, prisma_client + """ + Used by UI to generate 'alerting settings' page + { + field_name=field_name, + field_type=allowed_args[field_name]["type"], # string/int + field_description=field_info.description or "", # human-friendly description + field_value=general_settings.get(field_name, None), # example value + } + """ + if prisma_client is None: + raise HTTPException( + status_code=400, + detail={"error": CommonProxyErrors.db_not_connected_error.value}, + ) + + if user_api_key_dict.user_role != "proxy_admin": + raise HTTPException( + status_code=400, + detail={ + "error": "{}, your role={}".format( + CommonProxyErrors.not_allowed_access.value, + user_api_key_dict.user_role, + ) + }, + ) + + ## get general settings from db + 
db_general_settings = await prisma_client.db.litellm_config.find_first( + where={"param_name": "general_settings"} + ) + + if db_general_settings is not None and db_general_settings.param_value is not None: + db_general_settings_dict = dict(db_general_settings.param_value) + alerting_args_dict: dict = db_general_settings_dict.get("alerting_args", {}) # type: ignore + else: + alerting_args_dict = {} + + allowed_args = { + "daily_report_frequency": {"type": "Integer"}, + "report_check_interval": {"type": "Integer"}, + "budget_alert_ttl": {"type": "Integer"}, + "outage_alert_ttl": {"type": "Integer"}, + "region_outage_alert_ttl": {"type": "Integer"}, + "minor_outage_alert_threshold": {"type": "Integer"}, + "major_outage_alert_threshold": {"type": "Integer"}, + "max_outage_alert_list_size": {"type": "Integer"}, + } + + _slack_alerting: SlackAlerting = proxy_logging_obj.slack_alerting_instance + _slack_alerting_args_dict = _slack_alerting.alerting_args.model_dump() + + return_val = [] + + for field_name, field_info in SlackAlertingArgs.model_fields.items(): + if field_name in allowed_args: + + _stored_in_db: Optional[bool] = None + if field_name in alerting_args_dict: + _stored_in_db = True + else: + _stored_in_db = False + + _response_obj = ConfigList( + field_name=field_name, + field_type=allowed_args[field_name]["type"], + field_description=field_info.description or "", + field_value=_slack_alerting_args_dict.get(field_name, None), + stored_in_db=_stored_in_db, + field_default_value=field_info.default, + ) + return_val.append(_response_obj) + return return_val + + +# @router.post( +# "/alerting/update", +# description="Update the slack alerting settings. Persist value in db.", +# tags=["alerting"], +# dependencies=[Depends(user_api_key_auth)], +# include_in_schema=False, +# ) +# async def alerting_update( +# data: SlackAlertingArgs, +# user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +# ): +# """Allows updating slack alerting values. 
Used by UI.""" +# global prisma_client +# if prisma_client is None: +# raise HTTPException( +# status_code=400, +# detail={"error": CommonProxyErrors.db_not_connected_error.value}, +# ) + +# if user_api_key_dict.user_role != "proxy_admin": +# raise HTTPException( +# status_code=400, +# detail={"error": CommonProxyErrors.not_allowed_access.value}, +# ) + +# ## get general settings from db +# db_general_settings = await prisma_client.db.litellm_config.find_first( +# where={"param_name": "general_settings"} +# ) +# ### update value + +# alerting_args_dict = {} +# if db_general_settings is None or db_general_settings.param_value is None: +# general_settings = {} +# alerting_args_dict = {} +# else: +# general_settings = dict(db_general_settings.param_value) +# _alerting_args_dict = general_settings.get("alerting_args", None) +# if _alerting_args_dict is not None and isinstance(_alerting_args_dict, dict): +# alerting_args_dict = _alerting_args_dict + + +# alerting_args_dict = data.model + +# response = await prisma_client.db.litellm_config.upsert( +# where={"param_name": "general_settings"}, +# data={ +# "create": {"param_name": "general_settings", "param_value": json.dumps(general_settings)}, # type: ignore +# "update": {"param_value": json.dumps(general_settings)}, # type: ignore +# }, +# ) + +# return response + + #### EXPERIMENTAL QUEUING #### async def _litellm_chat_completions_worker(data, user_api_key_dict): """ @@ -10969,6 +11121,7 @@ async def get_config_list( field_description=field_info.description or "", field_value=general_settings.get(field_name, None), stored_in_db=_stored_in_db, + field_default_value=field_info.default, ) return_val.append(_response_obj) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 563496d66b..2bca287e2e 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -100,7 +100,7 @@ class ProxyLogging: "new_model_added", "outage_alerts", ] - self.slack_alerting_instance = SlackAlerting( + self.slack_alerting_instance: SlackAlerting = SlackAlerting( alerting_threshold=self.alerting_threshold, alerting=self.alerting, alert_types=self.alert_types, diff --git a/litellm/tests/test_alerting.py b/litellm/tests/test_alerting.py index 54555464a3..bcec0e6b22 100644 --- a/litellm/tests/test_alerting.py +++ b/litellm/tests/test_alerting.py @@ -576,7 +576,9 @@ async def test_outage_alerting_called( slack_alerting.update_values(llm_router=router) with patch.object( slack_alerting, "outage_alerts", new=AsyncMock() - ) as mock_send_alert: + ) as mock_outage_alert, patch.object( + slack_alerting, "region_outage_alerts", new=AsyncMock() + ) as mock_region_alert: try: await router.acompletion( model=model, @@ -586,7 +588,8 @@ async def test_outage_alerting_called( except Exception as e: pass - mock_send_alert.assert_called_once() + mock_outage_alert.assert_called_once() + mock_region_alert.assert_called_once() with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert: for _ in range(6): @@ -600,6 +603,112 @@ async def test_outage_alerting_called( pass await asyncio.sleep(3) if error_code == 500 or error_code == 408: + assert ( + mock_send_alert.assert_called_once() + ) # only model alert. 
region alert should only trigger for 2+ models in same region + else: + mock_send_alert.assert_not_called() + + +@pytest.mark.parametrize( + "model, api_base, llm_provider, vertex_project, vertex_location", + [ + ("gpt-3.5-turbo", None, "openai", None, None), + ( + "azure/gpt-3.5-turbo", + "https://openai-gpt-4-test-v-1.openai.azure.com", + "azure", + None, + None, + ), + ("gemini-pro", None, "vertex_ai", "hardy-device-38811", "us-central1"), + ], +) +@pytest.mark.parametrize("error_code", [500, 408, 400]) +@pytest.mark.asyncio +async def test_region_outage_alerting_called( + model, api_base, llm_provider, vertex_project, vertex_location, error_code +): + """ + If call fails, outage alert is called + + If multiple calls fail, outage alert is sent + """ + slack_alerting = SlackAlerting( + alerting=["webhook"], alert_types=["region_outage_alerts"] + ) + + litellm.callbacks = [slack_alerting] + + error_to_raise: Optional[APIError] = None + + if error_code == 400: + print("RAISING 400 ERROR CODE") + error_to_raise = litellm.BadRequestError( + message="this is a bad request", + model=model, + llm_provider=llm_provider, + ) + elif error_code == 408: + print("RAISING 408 ERROR CODE") + error_to_raise = litellm.Timeout( + message="A timeout occurred", model=model, llm_provider=llm_provider + ) + elif error_code == 500: + print("RAISING 500 ERROR CODE") + error_to_raise = litellm.ServiceUnavailableError( + message="API is unavailable", + model=model, + llm_provider=llm_provider, + response=httpx.Response( + status_code=503, + request=httpx.Request( + method="completion", + url="https://github.com/BerriAI/litellm", + ), + ), + ) + + router = Router( + model_list=[ + { + "model_name": model, + "litellm_params": { + "model": model, + "api_key": os.getenv("AZURE_API_KEY"), + "api_base": api_base, + "vertex_location": vertex_location, + "vertex_project": vertex_project, + }, + "model_info": {"id": "1"}, + }, + { + "model_name": model, + "litellm_params": { + "model": model, + "api_key": os.getenv("AZURE_API_KEY"), + "api_base": api_base, + "vertex_location": vertex_location, + "vertex_project": "vertex_project-2", + }, + "model_info": {"id": "2"}, + }, + ], + num_retries=0, + allowed_fails=100, + ) + + slack_alerting.update_values(llm_router=router) + with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert: + for idx in range(6): + if idx % 2 == 0: + deployment_id = "1" + else: + deployment_id = "2" + await slack_alerting.region_outage_alerts( + exception=error_to_raise, deployment_id=deployment_id # type: ignore + ) + if model == "gemini-pro" and (error_code == 500 or error_code == 408): mock_send_alert.assert_called_once() else: mock_send_alert.assert_not_called() diff --git a/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx b/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx new file mode 100644 index 0000000000..2e34766560 --- /dev/null +++ b/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx @@ -0,0 +1,123 @@ +/** + * UI for controlling slack alerting settings + */ +import React, { useState, useEffect } from "react"; +import { + Table, + TableHead, + TableRow, + TableHeaderCell, + TableCell, + Button, + Icon, + Badge, + TableBody, + Text, +} from "@tremor/react"; +import { InputNumber, message } from "antd"; +import { alertingSettingsCall, updateConfigFieldSetting } from "../networking"; +import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline"; +import DynamicForm from "./dynamic_form"; +interface 
alertingSettingsItem { + field_name: string; + field_type: string; + field_value: any; + field_default_value: any; + field_description: string; + stored_in_db: boolean | null; +} + +interface AlertingSettingsProps { + accessToken: string | null; +} + +const AlertingSettings: React.FC = ({ accessToken }) => { + const [alertingSettings, setAlertingSettings] = useState< + alertingSettingsItem[] + >([]); + + console.log("INSIDE ALERTING SETTINGS"); + useEffect(() => { + // get values + if (!accessToken) { + return; + } + alertingSettingsCall(accessToken).then((data) => { + setAlertingSettings(data); + }); + }, [accessToken]); + + const handleInputChange = (fieldName: string, newValue: any) => { + // Update the value in the state + const updatedSettings = alertingSettings.map((setting) => + setting.field_name === fieldName + ? { ...setting, field_value: newValue } + : setting + ); + setAlertingSettings(updatedSettings); + }; + + const handleSubmit = (formValues: Record) => { + if (!accessToken) { + return; + } + + let fieldValue = formValues; + + if (fieldValue == null || fieldValue == undefined) { + return; + } + + const initialFormValues: Record = {}; + alertingSettings.forEach((setting) => { + initialFormValues[setting.field_name] = setting.field_value; + }); + + // Merge initialFormValues with actual formValues + const mergedFormValues = { ...formValues, ...initialFormValues }; + try { + updateConfigFieldSetting(accessToken, "alerting_args", mergedFormValues); + // update value in state + message.success("Wait 10s for proxy to update."); + } catch (error) { + // do something + } + }; + + const handleResetField = (fieldName: string, idx: number) => { + if (!accessToken) { + return; + } + + try { + // deleteConfigFieldSetting(accessToken, fieldName); + // update value in state + + const updatedSettings = alertingSettings.map((setting) => + setting.field_name === fieldName + ? { + ...setting, + stored_in_db: null, + field_value: setting.field_default_value, + } + : setting + ); + console.log("INSIDE HANDLE RESET FIELD"); + setAlertingSettings(updatedSettings); + } catch (error) { + // do something + console.log("ERROR OCCURRED!"); + } + }; + + return ( + + ); +}; + +export default AlertingSettings; diff --git a/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx b/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx new file mode 100644 index 0000000000..c271712c93 --- /dev/null +++ b/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx @@ -0,0 +1,96 @@ +import React from "react"; +import { Form, Input, InputNumber, Row, Col, Button as Button2 } from "antd"; +import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline"; +import { Button, Badge, Icon, Text, TableRow, TableCell } from "@tremor/react"; +import Paragraph from "antd/es/typography/Paragraph"; +interface AlertingSetting { + field_name: string; + field_description: string; + field_type: string; + field_value: any; + stored_in_db: boolean | null; +} + +interface DynamicFormProps { + alertingSettings: AlertingSetting[]; + handleInputChange: (fieldName: string, newValue: any) => void; + handleResetField: (fieldName: string, index: number) => void; + handleSubmit: (formValues: Record) => void; +} + +const DynamicForm: React.FC = ({ + alertingSettings, + handleInputChange, + handleResetField, + handleSubmit, +}) => { + const [form] = Form.useForm(); + + const onFinish = () => { + const formData = form.getFieldsValue(); + handleSubmit(formData); + }; + + return ( +
+    <Form form={form} onFinish={onFinish}>
+      {alertingSettings.map((value, index) => (
+        <TableRow key={index}>
+          <TableCell>
+            <Text>{value.field_name}</Text>
+            <Paragraph>
+              {value.field_description}
+            </Paragraph>
+          </TableCell>
+          <TableCell>
+            <Form.Item name={value.field_name}>
+              {value.field_type === "Integer" ? (
+                <InputNumber
+                  value={value.field_value}
+                  onChange={(e) => handleInputChange(value.field_name, e)}
+                />
+              ) : (
+                <Input
+                  value={value.field_value}
+                  onChange={(e) => handleInputChange(value.field_name, e)}
+                />
+              )}
+            </Form.Item>
+          </TableCell>
+          <TableCell>
+            {value.stored_in_db == true ? (
+              <Badge icon={CheckCircleIcon} color="green">
+                In DB
+              </Badge>
+            ) : value.stored_in_db == false ? (
+              <Badge color="gray">In Config</Badge>
+            ) : (
+              <Badge color="gray">Not Set</Badge>
+            )}
+          </TableCell>
+          <TableCell>
+            <Icon
+              icon={TrashIcon}
+              color="red"
+              onClick={() => handleResetField(value.field_name, index)}
+            >
+              Reset
+            </Icon>
+          </TableCell>
+        </TableRow>
+      ))}
+      <Button2 htmlType="submit">
+        Update Settings
+      </Button2>
+    </Form>
+ ); +}; + +export default DynamicForm; diff --git a/ui/litellm-dashboard/src/components/general_settings.tsx b/ui/litellm-dashboard/src/components/general_settings.tsx index d16b434b89..62e37f1755 100644 --- a/ui/litellm-dashboard/src/components/general_settings.tsx +++ b/ui/litellm-dashboard/src/components/general_settings.tsx @@ -63,7 +63,6 @@ import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/compon import AddFallbacks from "./add_fallbacks"; import openai from "openai"; import Paragraph from "antd/es/skeleton/Paragraph"; - interface GeneralSettingsPageProps { accessToken: string | null; userRole: string | null; diff --git a/ui/litellm-dashboard/src/components/networking.tsx b/ui/litellm-dashboard/src/components/networking.tsx index e7678ba10a..28964a4a8b 100644 --- a/ui/litellm-dashboard/src/components/networking.tsx +++ b/ui/litellm-dashboard/src/components/networking.tsx @@ -207,6 +207,41 @@ export const budgetCreateCall = async ( throw error; } }; + +export const alertingSettingsCall = async (accessToken: String) => { + /** + * Get all configurable params for setting a model + */ + try { + let url = proxyBaseUrl + ? `${proxyBaseUrl}/alerting/settings` + : `/alerting/settings`; + + //message.info("Requesting model data"); + const response = await fetch(url, { + method: "GET", + headers: { + Authorization: `Bearer ${accessToken}`, + "Content-Type": "application/json", + }, + }); + + if (!response.ok) { + const errorData = await response.text(); + message.error(errorData, 10); + throw new Error("Network response was not ok"); + } + + const data = await response.json(); + //message.info("Received model data"); + return data; + // Handle success - you might want to update some state or UI based on the created key + } catch (error) { + console.error("Failed to get callbacks:", error); + throw error; + } +}; + export const keyCreateCall = async ( accessToken: string, userID: string, @@ -995,9 +1030,16 @@ export const adminTopEndUsersCall = async ( } }; -export const adminspendByProvider = async (accessToken: String, keyToken: String | null, startTime: String | undefined, endTime: String | undefined) => { +export const adminspendByProvider = async ( + accessToken: String, + keyToken: String | null, + startTime: String | undefined, + endTime: String | undefined +) => { try { - let url = proxyBaseUrl ? `${proxyBaseUrl}/global/spend/provider` : `/global/spend/provider`; + let url = proxyBaseUrl + ? `${proxyBaseUrl}/global/spend/provider` + : `/global/spend/provider`; if (startTime && endTime) { url += `?start_date=${startTime}&end_date=${endTime}`; @@ -1036,9 +1078,15 @@ export const adminspendByProvider = async (accessToken: String, keyToken: String } }; -export const adminGlobalActivity = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => { +export const adminGlobalActivity = async ( + accessToken: String, + startTime: String | undefined, + endTime: String | undefined +) => { try { - let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity` : `/global/activity`; + let url = proxyBaseUrl + ? 
`${proxyBaseUrl}/global/activity` + : `/global/activity`; if (startTime && endTime) { url += `?start_date=${startTime}&end_date=${endTime}`; @@ -1071,10 +1119,15 @@ export const adminGlobalActivity = async (accessToken: String, startTime: String } }; - -export const adminGlobalActivityPerModel = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => { +export const adminGlobalActivityPerModel = async ( + accessToken: String, + startTime: String | undefined, + endTime: String | undefined +) => { try { - let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity/model` : `/global/activity/model`; + let url = proxyBaseUrl + ? `${proxyBaseUrl}/global/activity/model` + : `/global/activity/model`; if (startTime && endTime) { url += `?start_date=${startTime}&end_date=${endTime}`; @@ -1107,7 +1160,6 @@ export const adminGlobalActivityPerModel = async (accessToken: String, startTime } }; - export const adminTopModelsCall = async (accessToken: String) => { try { let url = proxyBaseUrl diff --git a/ui/litellm-dashboard/src/components/settings.tsx b/ui/litellm-dashboard/src/components/settings.tsx index 86d9cbfd3b..db330905ae 100644 --- a/ui/litellm-dashboard/src/components/settings.tsx +++ b/ui/litellm-dashboard/src/components/settings.tsx @@ -31,7 +31,7 @@ import { } from "./networking"; import { Modal, Form, Input, Select, Button as Button2, message } from "antd"; import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider"; - +import AlertingSettings from "./alerting/alerting_settings"; interface SettingsPageProps { accessToken: string | null; userRole: string | null; @@ -117,6 +117,7 @@ const Settings: React.FC = ({ db_exceptions: "Database Exceptions (Read/Write)", daily_reports: "Weekly/Monthly Spend Reports", outage_alerts: "Outage Alerts", + region_outage_alerts: "Region Outage Alerts", }; useEffect(() => { @@ -365,7 +366,8 @@ const Settings: React.FC = ({ Logging Callbacks - Alerting + Alerting Types + Alerting Settings @@ -496,6 +498,9 @@ const Settings: React.FC = ({ + + +
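
A minimal usage sketch (not part of the diff) of how the new `alerting_args` overrides reach `SlackAlerting`. It only reuses calls that appear in this PR — the `SlackAlerting(...)` constructor from the tests and the `update_values(alerting_args=...)` call made by `ProxyConfig` — and assumes `update_values` applies the dict onto `SlackAlertingArgs`; the override values themselves are illustrative.

```python
from litellm.integrations.slack_alerting import SlackAlerting

# Construct the alerter the same way the new region-outage test does.
slack_alerting = SlackAlerting(
    alerting=["webhook"],
    alert_types=["outage_alerts", "region_outage_alerts"],
)

# Mirror ProxyConfig: push `general_settings.alerting_args` overrides into the
# running instance. (Values below are illustrative, not the PR's defaults.)
slack_alerting.update_values(
    alerting_args={
        "region_outage_alert_ttl": 120,     # widen the region error window to 2 minutes
        "minor_outage_alert_threshold": 3,  # minor alert after 3 non-400 errors
    }
)

# Fields not overridden keep the SlackAlertingArgs defaults defined above.
print(slack_alerting.alerting_args.region_outage_alert_ttl)       # 120
print(slack_alerting.alerting_args.major_outage_alert_threshold)  # 10 (default)
```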