Merge pull request #3844 from BerriAI/litellm_region_based_alerts

feat(slack_alerting.py): enable provider-region based alerting
This commit is contained in:
Krish Dholakia 2024-05-25 21:03:16 -07:00 committed by GitHub
commit 960fa8b326
10 changed files with 809 additions and 100 deletions
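For context, this PR's own tests opt into the new alert type as shown below - a minimal sketch mirroring the test setup in test_region_outage_alerting_called (the "webhook" alerting mode is taken from the test, not a recommended production config):

import litellm
from litellm.integrations.slack_alerting import SlackAlerting

# opt in to the provider-region based alerting added by this PR
slack_alerting = SlackAlerting(
    alerting=["webhook"],
    alert_types=["region_outage_alerts"],
)
litellm.callbacks = [slack_alerting]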

View file

@ -4,13 +4,13 @@ import dotenv, os, traceback
from litellm.proxy._types import UserAPIKeyAuth, CallInfo, AlertType
from litellm._logging import verbose_logger, verbose_proxy_logger
import litellm, threading
from typing import List, Literal, Any, Union, Optional, Dict
from typing import List, Literal, Any, Union, Optional, Dict, Set
from litellm.caching import DualCache
import asyncio, time
import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime
from pydantic import BaseModel
from pydantic import BaseModel, Field
from enum import Enum
from datetime import datetime as dt, timedelta, timezone
from litellm.integrations.custom_logger import CustomLogger
@ -20,17 +20,25 @@ from typing import TypedDict
from openai import APIError
import litellm.types
import litellm.types.router
from litellm.types.router import LiteLLM_Params
class OutageModel(TypedDict):
model_id: str
class BaseOutageModel(TypedDict):
alerts: List[int]
deployment_ids: List[str]
minor_alert_sent: bool
major_alert_sent: bool
last_updated_at: float
class OutageModel(BaseOutageModel):
model_id: str
class ProviderRegionOutageModel(BaseOutageModel):
provider_region_id: str
deployment_ids: Set[str]
# we use this for the email header; if you change this, send a test email and verify it looks good
LITELLM_LOGO_URL = "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
EMAIL_LOGO_URL = os.getenv(
@ -52,17 +60,55 @@ class LiteLLMBase(BaseModel):
return self.dict()
class SlackAlertingArgsEnum(Enum):
daily_report_frequency: int = 12 * 60 * 60
report_check_interval: int = 5 * 60
budget_alert_ttl: int = 24 * 60 * 60
outage_alert_ttl: int = 1 * 60
region_outage_alert_ttl: int = 1 * 60
minor_outage_alert_threshold: int = 1 * 5
major_outage_alert_threshold: int = 1 * 10
max_outage_alert_list_size: int = 1 * 10
class SlackAlertingArgs(LiteLLMBase):
default_daily_report_frequency: int = 12 * 60 * 60 # 12 hours
daily_report_frequency: int = int(
os.getenv("SLACK_DAILY_REPORT_FREQUENCY", default_daily_report_frequency)
daily_report_frequency: int = Field(
default=int(
os.getenv(
"SLACK_DAILY_REPORT_FREQUENCY",
SlackAlertingArgsEnum.daily_report_frequency.value,
)
),
description="Frequency of receiving deployment latency/failure reports. Default is 12hours. Value is in seconds.",
)
report_check_interval: int = 5 * 60 # 5 minutes
budget_alert_ttl: int = 24 * 60 * 60 # 24 hours
outage_alert_ttl: int = 1 * 60 # 1 minute ttl
minor_outage_alert_threshold: int = 5
major_outage_alert_threshold: int = 10
max_outage_alert_list_size: int = 10 # prevent memory leak
report_check_interval: int = Field(
default=SlackAlertingArgsEnum.report_check_interval.value,
description="Frequency of checking cache if report should be sent. Background process. Default is 5 minutes. Value is in seconds.",
) # 5 minutes
budget_alert_ttl: int = Field(
default=SlackAlertingArgsEnum.budget_alert_ttl.value,
description="Cache ttl for budgets alerts. Prevents spamming same alert, each time budget is crossed. Value is in seconds.",
) # 24 hours
outage_alert_ttl: int = Field(
default=SlackAlertingArgsEnum.outage_alert_ttl.value,
description="Cache ttl for model outage alerts. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
) # 1 minute ttl
region_outage_alert_ttl: int = Field(
default=SlackAlertingArgsEnum.region_outage_alert_ttl.value,
description="Cache ttl for provider-region based outage alerts. Alert sent if 2+ models in same region report errors. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
) # 1 minute ttl
minor_outage_alert_threshold: int = Field(
default=SlackAlertingArgsEnum.minor_outage_alert_threshold.value,
description="The number of errors that count as a model/region minor outage. ('400' error code is not counted).",
)
major_outage_alert_threshold: int = Field(
default=SlackAlertingArgsEnum.major_outage_alert_threshold.value,
description="The number of errors that countas a model/region major outage. ('400' error code is not counted).",
)
max_outage_alert_list_size: int = Field(
default=SlackAlertingArgsEnum.max_outage_alert_list_size.value,
description="Maximum number of errors to store in cache. For a given model/region. Prevents memory leaks.",
) # prevent memory leak
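Because the args are now pydantic Fields, they can also be overridden programmatically. A minimal sketch, assuming SlackAlerting's constructor defaults and the SlackAlertingArgsEnum values above (the override values here are illustrative only):

from litellm.integrations.slack_alerting import SlackAlerting, SlackAlertingArgs

args = SlackAlertingArgs(
    region_outage_alert_ttl=120,  # widen the error time-window to 2 minutes
    minor_outage_alert_threshold=3,  # fire the minor alert after 3 non-400 errors
)
slack_alerting = SlackAlerting()
slack_alerting.update_values(alerting_args=args.model_dump())  # same path the proxy config uses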
class DeploymentMetrics(LiteLLMBase):
@ -736,6 +782,163 @@ class SlackAlerting(CustomLogger):
return error_msg
def _outage_alert_msg_factory(
self,
alert_type: Literal["Major", "Minor"],
key: Literal["Model", "Region"],
key_val: str,
provider: str,
api_base: Optional[str],
outage_value: BaseOutageModel,
) -> str:
"""Format an alert message for slack"""
headers = {f"{key} Name": key_val, "Provider": provider}
if api_base is not None:
headers["API Base"] = api_base # type: ignore
headers_str = "\n"
for k, v in headers.items():
headers_str += f"*{k}:* `{v}`\n"
return f"""\n\n
* {alert_type} Service Outage*
{headers_str}
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
async def region_outage_alerts(
self,
exception: APIError,
deployment_id: str,
) -> None:
"""
Send slack alert if specific provider region is having an outage.
Track for 408 (Timeout) and >=500 Error codes
"""
## CREATE (PROVIDER+REGION) ID ##
if self.llm_router is None:
return
deployment = self.llm_router.get_deployment(model_id=deployment_id)
if deployment is None:
return
model = deployment.litellm_params.model
### GET PROVIDER ###
provider = deployment.litellm_params.custom_llm_provider
if provider is None:
model, provider, _, _ = litellm.get_llm_provider(model=model)
### GET REGION ###
region_name = deployment.litellm_params.region_name
if region_name is None:
region_name = litellm.utils._get_model_region(
custom_llm_provider=provider, litellm_params=deployment.litellm_params
)
if region_name is None:
return
### UNIQUE CACHE KEY ###
cache_key = provider + region_name
outage_value: Optional[ProviderRegionOutageModel] = (
await self.internal_usage_cache.async_get_cache(key=cache_key)
)
if (
getattr(exception, "status_code", None) is None
or (
exception.status_code != 408 # type: ignore
and exception.status_code < 500 # type: ignore
)
or self.llm_router is None
):
return
if outage_value is None:
_deployment_set = set()
_deployment_set.add(deployment_id)
outage_value = ProviderRegionOutageModel(
provider_region_id=cache_key,
alerts=[exception.status_code], # type: ignore
minor_alert_sent=False,
major_alert_sent=False,
last_updated_at=time.time(),
deployment_ids=_deployment_set,
)
## add to cache ##
await self.internal_usage_cache.async_set_cache(
key=cache_key,
value=outage_value,
ttl=self.alerting_args.region_outage_alert_ttl,
)
return
if len(outage_value["alerts"]) < self.alerting_args.max_outage_alert_list_size:
outage_value["alerts"].append(exception.status_code) # type: ignore
else: # prevent memory leaks
pass
_deployment_set = outage_value["deployment_ids"]
_deployment_set.add(deployment_id)
outage_value["deployment_ids"] = _deployment_set
outage_value["last_updated_at"] = time.time()
## MINOR OUTAGE ALERT SENT ##
if (
outage_value["minor_alert_sent"] == False
and len(outage_value["alerts"])
>= self.alerting_args.minor_outage_alert_threshold
and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment
):
msg = self._outage_alert_msg_factory(
alert_type="Minor",
key="Region",
key_val=region_name,
api_base=None,
outage_value=outage_value,
provider=provider,
)
# send minor alert
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
)
# set to true
outage_value["minor_alert_sent"] = True
## MAJOR OUTAGE ALERT SENT ##
elif (
outage_value["major_alert_sent"] == False
and len(outage_value["alerts"])
>= self.alerting_args.major_outage_alert_threshold
and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment
):
msg = self._outage_alert_msg_factory(
alert_type="Major",
key="Region",
key_val=region_name,
api_base=None,
outage_value=outage_value,
provider=provider,
)
# send major alert
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
# set to true
outage_value["major_alert_sent"] = True
## update cache ##
await self.internal_usage_cache.async_set_cache(
key=cache_key, value=outage_value
)
async def outage_alerts(
self,
exception: APIError,
@ -787,7 +990,6 @@ class SlackAlerting(CustomLogger):
outage_value = OutageModel(
model_id=deployment_id,
alerts=[exception.status_code], # type: ignore
deployment_ids=[deployment_id],
minor_alert_sent=False,
major_alert_sent=False,
last_updated_at=time.time(),
@ -801,8 +1003,14 @@ class SlackAlerting(CustomLogger):
)
return
outage_value["alerts"].append(exception.status_code) # type: ignore
outage_value["deployment_ids"].append(deployment_id)
if (
len(outage_value["alerts"])
< self.alerting_args.max_outage_alert_list_size
):
outage_value["alerts"].append(exception.status_code) # type: ignore
else: # prevent memory leaks
pass
outage_value["last_updated_at"] = time.time()
## MINOR OUTAGE ALERT SENT ##
@ -811,25 +1019,18 @@ class SlackAlerting(CustomLogger):
and len(outage_value["alerts"])
>= self.alerting_args.minor_outage_alert_threshold
):
msg = f"""\n\n
* Minor Service Outage*
*Model Name:* `{model}`
*Provider:* `{provider}`
*API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
msg = self._outage_alert_msg_factory(
alert_type="Minor",
key="Model",
key_val=model,
api_base=api_base,
outage_value=outage_value,
provider=provider,
)
# send minor alert
_result_val = self.send_alert(
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
)
if _result_val is not None:
await _result_val
# set to true
outage_value["minor_alert_sent"] = True
elif (
@ -837,19 +1038,14 @@ class SlackAlerting(CustomLogger):
and len(outage_value["alerts"])
>= self.alerting_args.major_outage_alert_threshold
):
msg = f"""\n\n
* Major Service Outage*
*Model Name:* `{model}`
*Provider:* `{provider}`
*API Base:* `{api_base}`
*Errors:*
{self._count_outage_alerts(alerts=outage_value["alerts"])}
*Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
"""
msg = self._outage_alert_msg_factory(
alert_type="Major",
key="Model",
key_val=model,
api_base=api_base,
outage_value=outage_value,
provider=provider,
)
# send major alert
await self.send_alert(
message=msg, level="High", alert_type="outage_alerts"
@ -1103,18 +1299,7 @@ Model Info:
self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
"daily_reports",
"spend_reports",
"new_model_added",
"cooldown_deployment",
"outage_alerts",
],
alert_type: AlertType,
user_info: Optional[WebhookEvent] = None,
**kwargs,
):
@ -1254,34 +1439,17 @@ Model Info:
except Exception as e:
verbose_logger.debug(f"Exception raised - {str(e)}")
if "outage_alerts" in self.alert_types and isinstance(
kwargs.get("exception", ""), APIError
):
_litellm_params = litellm.types.router.LiteLLM_Params(
model=kwargs.get("model", ""),
**kwargs.get("litellm_params", {}),
**kwargs.get("optional_params", {}),
)
_region_name = litellm.utils._get_model_region(
custom_llm_provider=kwargs.get("custom_llm_provider", ""),
litellm_params=_litellm_params,
)
# if region name not known, default to api base #
if _region_name is None:
_region_name = litellm.get_api_base(
model=kwargs.get("model", ""),
optional_params={
**kwargs.get("litellm_params", {}),
**kwargs.get("optional_params", {}),
},
if isinstance(kwargs.get("exception", ""), APIError):
if "outage_alerts" in self.alert_types:
await self.outage_alerts(
exception=kwargs["exception"],
deployment_id=model_id,
)
if _region_name is None:
_region_name = ""
await self.outage_alerts(
exception=kwargs["exception"],
deployment_id=model_id,
)
if "region_outage_alerts" in self.alert_types:
await self.region_outage_alerts(
exception=kwargs["exception"], deployment_id=model_id
)
except Exception as e:
pass

View file

@ -18,6 +18,7 @@ AlertType = Literal[
"cooldown_deployment",
"new_model_added",
"outage_alerts",
"region_outage_alerts",
]
@ -835,6 +836,7 @@ class ConfigList(LiteLLMBase):
field_description: str
field_value: Any
stored_in_db: Optional[bool]
field_default_value: Any
class ConfigGeneralSettings(LiteLLMBase):
@ -912,7 +914,9 @@ class ConfigGeneralSettings(LiteLLMBase):
None,
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
)
alerting_args: Optional[Dict] = Field(
None, description="Controllable params for slack alerting - e.g. ttl in cache."
)
alerting_threshold: Optional[int] = Field(
None,
description="sends alerts if requests hang for 5min+",

View file

@ -130,6 +130,7 @@ from litellm.proxy.auth.auth_checks import (
)
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from litellm.exceptions import RejectedRequestError
from litellm.integrations.slack_alerting import SlackAlertingArgs, SlackAlerting
try:
from litellm._version import version
@ -3050,6 +3051,13 @@ class ProxyConfig:
"global_max_parallel_requests"
]
## ALERTING ARGS ##
if "alerting_args" in _general_settings:
general_settings["alerting_args"] = _general_settings["alerting_args"]
proxy_logging_obj.slack_alerting_instance.update_values(
alerting_args=general_settings["alerting_args"],
)
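The block above is what lets alerting_args be set from the proxy config file. A sketch of the wiring, with illustrative values (the alerting/alert_types keys shown are assumptions about the surrounding config, not part of this diff):

general_settings:
  alerting: ["slack"]
  alert_types: ["outage_alerts", "region_outage_alerts"]
  alerting_args:
    region_outage_alert_ttl: 60
    minor_outage_alert_threshold: 5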
async def add_deployment(
self,
prisma_client: PrismaClient,
@ -8910,6 +8918,7 @@ async def budget_settings(
field_description=field_info.description or "",
field_value=db_budget_row_dict.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)
@ -9826,6 +9835,149 @@ async def model_settings():
return returned_list
#### ALERTING MANAGEMENT ENDPOINTS ####
@router.get(
"/alerting/settings",
description="Return the configurable alerting param, description, and current value",
tags=["alerting"],
dependencies=[Depends(user_api_key_auth)],
include_in_schema=False,
)
async def alerting_settings(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
global proxy_logging_obj, prisma_client
"""
Used by UI to generate 'alerting settings' page
{
field_name=field_name,
field_type=allowed_args[field_name]["type"], # string/int
field_description=field_info.description or "", # human-friendly description
field_value=general_settings.get(field_name, None), # example value
}
"""
if prisma_client is None:
raise HTTPException(
status_code=400,
detail={"error": CommonProxyErrors.db_not_connected_error.value},
)
if user_api_key_dict.user_role != "proxy_admin":
raise HTTPException(
status_code=400,
detail={
"error": "{}, your role={}".format(
CommonProxyErrors.not_allowed_access.value,
user_api_key_dict.user_role,
)
},
)
## get general settings from db
db_general_settings = await prisma_client.db.litellm_config.find_first(
where={"param_name": "general_settings"}
)
if db_general_settings is not None and db_general_settings.param_value is not None:
db_general_settings_dict = dict(db_general_settings.param_value)
alerting_args_dict: dict = db_general_settings_dict.get("alerting_args", {}) # type: ignore
else:
alerting_args_dict = {}
allowed_args = {
"daily_report_frequency": {"type": "Integer"},
"report_check_interval": {"type": "Integer"},
"budget_alert_ttl": {"type": "Integer"},
"outage_alert_ttl": {"type": "Integer"},
"region_outage_alert_ttl": {"type": "Integer"},
"minor_outage_alert_threshold": {"type": "Integer"},
"major_outage_alert_threshold": {"type": "Integer"},
"max_outage_alert_list_size": {"type": "Integer"},
}
_slack_alerting: SlackAlerting = proxy_logging_obj.slack_alerting_instance
_slack_alerting_args_dict = _slack_alerting.alerting_args.model_dump()
return_val = []
for field_name, field_info in SlackAlertingArgs.model_fields.items():
if field_name in allowed_args:
_stored_in_db: Optional[bool] = None
if field_name in alerting_args_dict:
_stored_in_db = True
else:
_stored_in_db = False
_response_obj = ConfigList(
field_name=field_name,
field_type=allowed_args[field_name]["type"],
field_description=field_info.description or "",
field_value=_slack_alerting_args_dict.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)
return return_val
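A quick way to exercise the new endpoint - a sketch assuming the proxy runs at http://0.0.0.0:4000 and sk-1234 is a hypothetical proxy_admin key:

import requests

resp = requests.get(
    "http://0.0.0.0:4000/alerting/settings",
    headers={"Authorization": "Bearer sk-1234"},
)
# each item carries: field_name, field_type, field_description,
# field_value, stored_in_db, field_default_value
print(resp.json())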
# @router.post(
# "/alerting/update",
# description="Update the slack alerting settings. Persist value in db.",
# tags=["alerting"],
# dependencies=[Depends(user_api_key_auth)],
# include_in_schema=False,
# )
# async def alerting_update(
# data: SlackAlertingArgs,
# user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
# ):
# """Allows updating slack alerting values. Used by UI."""
# global prisma_client
# if prisma_client is None:
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.db_not_connected_error.value},
# )
# if user_api_key_dict.user_role != "proxy_admin":
# raise HTTPException(
# status_code=400,
# detail={"error": CommonProxyErrors.not_allowed_access.value},
# )
# ## get general settings from db
# db_general_settings = await prisma_client.db.litellm_config.find_first(
# where={"param_name": "general_settings"}
# )
# ### update value
# alerting_args_dict = {}
# if db_general_settings is None or db_general_settings.param_value is None:
# general_settings = {}
# alerting_args_dict = {}
# else:
# general_settings = dict(db_general_settings.param_value)
# _alerting_args_dict = general_settings.get("alerting_args", None)
# if _alerting_args_dict is not None and isinstance(_alerting_args_dict, dict):
# alerting_args_dict = _alerting_args_dict
# alerting_args_dict = data.model
# response = await prisma_client.db.litellm_config.upsert(
# where={"param_name": "general_settings"},
# data={
# "create": {"param_name": "general_settings", "param_value": json.dumps(general_settings)}, # type: ignore
# "update": {"param_value": json.dumps(general_settings)}, # type: ignore
# },
# )
# return response
#### EXPERIMENTAL QUEUING ####
async def _litellm_chat_completions_worker(data, user_api_key_dict):
"""
@ -10969,6 +11121,7 @@ async def get_config_list(
field_description=field_info.description or "",
field_value=general_settings.get(field_name, None),
stored_in_db=_stored_in_db,
field_default_value=field_info.default,
)
return_val.append(_response_obj)

View file

@ -100,7 +100,7 @@ class ProxyLogging:
"new_model_added",
"outage_alerts",
]
self.slack_alerting_instance = SlackAlerting(
self.slack_alerting_instance: SlackAlerting = SlackAlerting(
alerting_threshold=self.alerting_threshold,
alerting=self.alerting,
alert_types=self.alert_types,

View file

@ -576,7 +576,9 @@ async def test_outage_alerting_called(
slack_alerting.update_values(llm_router=router)
with patch.object(
slack_alerting, "outage_alerts", new=AsyncMock()
) as mock_send_alert:
) as mock_outage_alert, patch.object(
slack_alerting, "region_outage_alerts", new=AsyncMock()
) as mock_region_alert:
try:
await router.acompletion(
model=model,
@ -586,7 +588,8 @@ async def test_outage_alerting_called(
except Exception as e:
pass
mock_send_alert.assert_called_once()
mock_outage_alert.assert_called_once()
mock_region_alert.assert_called_once()
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
for _ in range(6):
@ -600,6 +603,112 @@ async def test_outage_alerting_called(
pass
await asyncio.sleep(3)
if error_code == 500 or error_code == 408:
mock_send_alert.assert_called_once() # only model alert; region alert should only trigger for 2+ models in same region
else:
mock_send_alert.assert_not_called()
@pytest.mark.parametrize(
"model, api_base, llm_provider, vertex_project, vertex_location",
[
("gpt-3.5-turbo", None, "openai", None, None),
(
"azure/gpt-3.5-turbo",
"https://openai-gpt-4-test-v-1.openai.azure.com",
"azure",
None,
None,
),
("gemini-pro", None, "vertex_ai", "hardy-device-38811", "us-central1"),
],
)
@pytest.mark.parametrize("error_code", [500, 408, 400])
@pytest.mark.asyncio
async def test_region_outage_alerting_called(
model, api_base, llm_provider, vertex_project, vertex_location, error_code
):
"""
If a call fails, the region outage tracker is updated
If enough calls fail across 2+ deployments in the same provider region, a region outage alert is sent
"""
slack_alerting = SlackAlerting(
alerting=["webhook"], alert_types=["region_outage_alerts"]
)
litellm.callbacks = [slack_alerting]
error_to_raise: Optional[APIError] = None
if error_code == 400:
print("RAISING 400 ERROR CODE")
error_to_raise = litellm.BadRequestError(
message="this is a bad request",
model=model,
llm_provider=llm_provider,
)
elif error_code == 408:
print("RAISING 408 ERROR CODE")
error_to_raise = litellm.Timeout(
message="A timeout occurred", model=model, llm_provider=llm_provider
)
elif error_code == 500:
print("RAISING 500 ERROR CODE")
error_to_raise = litellm.ServiceUnavailableError(
message="API is unavailable",
model=model,
llm_provider=llm_provider,
response=httpx.Response(
status_code=503,
request=httpx.Request(
method="completion",
url="https://github.com/BerriAI/litellm",
),
),
)
router = Router(
model_list=[
{
"model_name": model,
"litellm_params": {
"model": model,
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": api_base,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
},
"model_info": {"id": "1"},
},
{
"model_name": model,
"litellm_params": {
"model": model,
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": api_base,
"vertex_location": vertex_location,
"vertex_project": "vertex_project-2",
},
"model_info": {"id": "2"},
},
],
num_retries=0,
allowed_fails=100,
)
slack_alerting.update_values(llm_router=router)
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
for idx in range(6):
if idx % 2 == 0:
deployment_id = "1"
else:
deployment_id = "2"
await slack_alerting.region_outage_alerts(
exception=error_to_raise, deployment_id=deployment_id # type: ignore
)
if model == "gemini-pro" and (error_code == 500 or error_code == 408):
mock_send_alert.assert_called_once()
else:
mock_send_alert.assert_not_called()

View file

@ -0,0 +1,123 @@
/**
* UI for controlling slack alerting settings
*/
import React, { useState, useEffect } from "react";
import {
Table,
TableHead,
TableRow,
TableHeaderCell,
TableCell,
Button,
Icon,
Badge,
TableBody,
Text,
} from "@tremor/react";
import { InputNumber, message } from "antd";
import { alertingSettingsCall, updateConfigFieldSetting } from "../networking";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import DynamicForm from "./dynamic_form";
interface alertingSettingsItem {
field_name: string;
field_type: string;
field_value: any;
field_default_value: any;
field_description: string;
stored_in_db: boolean | null;
}
interface AlertingSettingsProps {
accessToken: string | null;
}
const AlertingSettings: React.FC<AlertingSettingsProps> = ({ accessToken }) => {
const [alertingSettings, setAlertingSettings] = useState<
alertingSettingsItem[]
>([]);
console.log("INSIDE ALERTING SETTINGS");
useEffect(() => {
// get values
if (!accessToken) {
return;
}
alertingSettingsCall(accessToken).then((data) => {
setAlertingSettings(data);
});
}, [accessToken]);
const handleInputChange = (fieldName: string, newValue: any) => {
// Update the value in the state
const updatedSettings = alertingSettings.map((setting) =>
setting.field_name === fieldName
? { ...setting, field_value: newValue }
: setting
);
setAlertingSettings(updatedSettings);
};
const handleSubmit = (formValues: Record<string, any>) => {
if (!accessToken) {
return;
}
let fieldValue = formValues;
if (fieldValue == null || fieldValue == undefined) {
return;
}
const initialFormValues: Record<string, any> = {};
alertingSettings.forEach((setting) => {
initialFormValues[setting.field_name] = setting.field_value;
});
// Merge initialFormValues with actual formValues
const mergedFormValues = { ...formValues, ...initialFormValues };
try {
updateConfigFieldSetting(accessToken, "alerting_args", mergedFormValues);
// update value in state
message.success("Wait 10s for proxy to update.");
} catch (error) {
// do something
}
};
const handleResetField = (fieldName: string, idx: number) => {
if (!accessToken) {
return;
}
try {
// deleteConfigFieldSetting(accessToken, fieldName);
// update value in state
const updatedSettings = alertingSettings.map((setting) =>
setting.field_name === fieldName
? {
...setting,
stored_in_db: null,
field_value: setting.field_default_value,
}
: setting
);
console.log("INSIDE HANDLE RESET FIELD");
setAlertingSettings(updatedSettings);
} catch (error) {
// do something
console.log("ERROR OCCURRED!");
}
};
return (
<DynamicForm
alertingSettings={alertingSettings}
handleInputChange={handleInputChange}
handleResetField={handleResetField}
handleSubmit={handleSubmit}
/>
);
};
export default AlertingSettings;

View file

@ -0,0 +1,96 @@
import React from "react";
import { Form, Input, InputNumber, Row, Col, Button as Button2 } from "antd";
import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
import { Button, Badge, Icon, Text, TableRow, TableCell } from "@tremor/react";
import Paragraph from "antd/es/typography/Paragraph";
interface AlertingSetting {
field_name: string;
field_description: string;
field_type: string;
field_value: any;
stored_in_db: boolean | null;
}
interface DynamicFormProps {
alertingSettings: AlertingSetting[];
handleInputChange: (fieldName: string, newValue: any) => void;
handleResetField: (fieldName: string, index: number) => void;
handleSubmit: (formValues: Record<string, any>) => void;
}
const DynamicForm: React.FC<DynamicFormProps> = ({
alertingSettings,
handleInputChange,
handleResetField,
handleSubmit,
}) => {
const [form] = Form.useForm();
const onFinish = () => {
const formData = form.getFieldsValue();
handleSubmit(formData);
};
return (
<Form form={form} onFinish={onFinish} labelAlign="left">
{alertingSettings.map((value, index) => (
<TableRow key={index}>
<TableCell>
<Text>{value.field_name}</Text>
<p
style={{
fontSize: "0.65rem",
color: "#808080",
fontStyle: "italic",
}}
className="mt-1"
>
{value.field_description}
</p>
</TableCell>
<Form.Item name={value.field_name}>
<TableCell>
{value.field_type === "Integer" ? (
<InputNumber
step={1}
value={value.field_value}
onChange={(e) => handleInputChange(value.field_name, e)}
/>
) : (
<Input
value={value.field_value}
onChange={(e) => handleInputChange(value.field_name, e)}
/>
)}
</TableCell>
</Form.Item>
<TableCell>
{value.stored_in_db == true ? (
<Badge icon={CheckCircleIcon} className="text-white">
In DB
</Badge>
) : value.stored_in_db == false ? (
<Badge className="text-gray bg-white outline">In Config</Badge>
) : (
<Badge className="text-gray bg-white outline">Not Set</Badge>
)}
</TableCell>
<TableCell>
<Icon
icon={TrashIcon}
color="red"
onClick={() => handleResetField(value.field_name, index)}
>
Reset
</Icon>
</TableCell>
</TableRow>
))}
<div>
<Button2 htmlType="submit">Update Settings</Button2>
</div>
</Form>
);
};
export default DynamicForm;

View file

@ -63,7 +63,6 @@ import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/compon
import AddFallbacks from "./add_fallbacks";
import openai from "openai";
import Paragraph from "antd/es/skeleton/Paragraph";
interface GeneralSettingsPageProps {
accessToken: string | null;
userRole: string | null;

View file

@ -207,6 +207,41 @@ export const budgetCreateCall = async (
throw error;
}
};
export const alertingSettingsCall = async (accessToken: String) => {
/**
* Get all configurable alerting params and their current values
*/
try {
let url = proxyBaseUrl
? `${proxyBaseUrl}/alerting/settings`
: `/alerting/settings`;
//message.info("Requesting model data");
const response = await fetch(url, {
method: "GET",
headers: {
Authorization: `Bearer ${accessToken}`,
"Content-Type": "application/json",
},
});
if (!response.ok) {
const errorData = await response.text();
message.error(errorData, 10);
throw new Error("Network response was not ok");
}
const data = await response.json();
//message.info("Received model data");
return data;
// Handle success - you might want to update some state or UI based on the returned settings
} catch (error) {
console.error("Failed to get callbacks:", error);
throw error;
}
};
export const keyCreateCall = async (
accessToken: string,
userID: string,
@ -995,9 +1030,16 @@ export const adminTopEndUsersCall = async (
}
};
export const adminspendByProvider = async (accessToken: String, keyToken: String | null, startTime: String | undefined, endTime: String | undefined) => {
export const adminspendByProvider = async (
accessToken: String,
keyToken: String | null,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/spend/provider` : `/global/spend/provider`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/spend/provider`
: `/global/spend/provider`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1036,9 +1078,15 @@ export const adminspendByProvider = async (accessToken: String, keyToken: String
}
};
export const adminGlobalActivity = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
export const adminGlobalActivity = async (
accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity` : `/global/activity`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity`
: `/global/activity`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1071,10 +1119,15 @@ export const adminGlobalActivity = async (accessToken: String, startTime: String
}
};
export const adminGlobalActivityPerModel = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
export const adminGlobalActivityPerModel = async (
accessToken: String,
startTime: String | undefined,
endTime: String | undefined
) => {
try {
let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity/model` : `/global/activity/model`;
let url = proxyBaseUrl
? `${proxyBaseUrl}/global/activity/model`
: `/global/activity/model`;
if (startTime && endTime) {
url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1107,7 +1160,6 @@ export const adminGlobalActivityPerModel = async (accessToken: String, startTime
}
};
export const adminTopModelsCall = async (accessToken: String) => {
try {
let url = proxyBaseUrl

View file

@ -31,7 +31,7 @@ import {
} from "./networking";
import { Modal, Form, Input, Select, Button as Button2, message } from "antd";
import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider";
import AlertingSettings from "./alerting/alerting_settings";
interface SettingsPageProps {
accessToken: string | null;
userRole: string | null;
@ -117,6 +117,7 @@ const Settings: React.FC<SettingsPageProps> = ({
db_exceptions: "Database Exceptions (Read/Write)",
daily_reports: "Weekly/Monthly Spend Reports",
outage_alerts: "Outage Alerts",
region_outage_alerts: "Region Outage Alerts",
};
useEffect(() => {
@ -365,7 +366,8 @@ const Settings: React.FC<SettingsPageProps> = ({
<TabGroup>
<TabList variant="line" defaultValue="1">
<Tab value="1">Logging Callbacks</Tab>
<Tab value="2">Alerting</Tab>
<Tab value="2">Alerting Types</Tab>
<Tab value="2">Alerting Settings</Tab>
</TabList>
<TabPanels>
<TabPanel>
@ -496,6 +498,9 @@ const Settings: React.FC<SettingsPageProps> = ({
</Button>
</Card>
</TabPanel>
<TabPanel>
<AlertingSettings accessToken={accessToken} />
</TabPanel>
</TabPanels>
</TabGroup>
</Grid>