feat(ui): allow admin to configure slack alerting thresholds on ui

2025-04-26 11:14:04 +00:00 · 2024-05-25 21:01:19 -07:00 · 2024-05-25 21:01:19 -07:00 · a9a1447513
commit a9a1447513
parent c2f19d631e
9 changed files with 492 additions and 24 deletions
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@ -10,7 +10,7 @@ import asyncio, time
 import aiohttp
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
 import datetime
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from enum import Enum
 from datetime import datetime as dt, timedelta, timezone
 from litellm.integrations.custom_logger import CustomLogger
@ -60,18 +60,55 @@ class LiteLLMBase(BaseModel):
            return self.dict()
 class SlackAlertingArgsEnum(Enum):
     """
     Built-in defaults for the tunable slack alerting arguments.

     Read each default through `.value`. Members that share a value
     (e.g. the two outage ttls) are enum aliases of the first member
     defined with that value, so compare on `.value`, not on identity.
     """

     # reporting cadence (seconds)
     daily_report_frequency: int = 60 * 60 * 12  # 12 hours
     report_check_interval: int = 60 * 5  # 5 minutes

     # cache ttls (seconds)
     budget_alert_ttl: int = 60 * 60 * 24  # 24 hours
     outage_alert_ttl: int = 60  # 1 minute
     region_outage_alert_ttl: int = 60  # 1 minute

     # outage thresholds / limits (error counts)
     minor_outage_alert_threshold: int = 5
     major_outage_alert_threshold: int = 10
     max_outage_alert_list_size: int = 10
 class SlackAlertingArgs(LiteLLMBase):
-    default_daily_report_frequency: int = 12 * 60 * 60  # 12 hours
+    daily_report_frequency: int = Field(
-    daily_report_frequency: int = int(
+        default=int(
-        os.getenv("SLACK_DAILY_REPORT_FREQUENCY", default_daily_report_frequency)
+            os.getenv(
                "SLACK_DAILY_REPORT_FREQUENCY",
                SlackAlertingArgsEnum.daily_report_frequency.value,
            )
        ),
        description="Frequency of receiving deployment latency/failure reports. Default is 12 hours. Value is in seconds.",
    )
-    report_check_interval: int = 5 * 60  # 5 minutes
+    report_check_interval: int = Field(
-    budget_alert_ttl: int = 24 * 60 * 60  # 24 hours
+        default=SlackAlertingArgsEnum.report_check_interval.value,
-    outage_alert_ttl: int = 1 * 60  # 1 minute ttl
+        description="Frequency of checking cache if report should be sent. Background process. Default is 5 minutes. Value is in seconds.",
-    region_outage_alert_ttl: int = 1 * 60  # 1 minute ttl
+    )  # 5 minutes
-    minor_outage_alert_threshold: int = 5
+    budget_alert_ttl: int = Field(
-    major_outage_alert_threshold: int = 10
+        default=SlackAlertingArgsEnum.budget_alert_ttl.value,
-    max_outage_alert_list_size: int = 10  # prevent memory leak
+        description="Cache ttl for budgets alerts. Prevents spamming same alert, each time budget is crossed. Value is in seconds.",
    )  # 24 hours
    outage_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.outage_alert_ttl.value,
        description="Cache ttl for model outage alerts. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
    )  # 1 minute ttl
    region_outage_alert_ttl: int = Field(
        default=SlackAlertingArgsEnum.region_outage_alert_ttl.value,
        description="Cache ttl for provider-region based outage alerts. Alert sent if 2+ models in same region report errors. Sets time-window for errors. Default is 1 minute. Value is in seconds.",
    )  # 1 minute ttl
    minor_outage_alert_threshold: int = Field(
        default=SlackAlertingArgsEnum.minor_outage_alert_threshold.value,
        description="The number of errors that count as a model/region minor outage. ('400' error code is not counted).",
    )
    major_outage_alert_threshold: int = Field(
        default=SlackAlertingArgsEnum.major_outage_alert_threshold.value,
        description="The number of errors that count as a model/region major outage. ('400' error code is not counted).",
    )
    max_outage_alert_list_size: int = Field(
        default=SlackAlertingArgsEnum.max_outage_alert_list_size.value,
        description="Maximum number of errors to store in cache. For a given model/region. Prevents memory leaks.",
    )  # prevent memory leak
 class DeploymentMetrics(LiteLLMBase):
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@ -800,6 +800,7 @@ class ConfigList(LiteLLMBase):
    field_description: str
    field_value: Any
    stored_in_db: Optional[bool]
    field_default_value: Any
 class ConfigGeneralSettings(LiteLLMBase):
@ -877,7 +878,9 @@ class ConfigGeneralSettings(LiteLLMBase):
        None,
        description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
    )
-
+    alerting_args: Optional[Dict] = Field(
        None, description="Controllable params for slack alerting - e.g. ttl in cache."
    )
    alerting_threshold: Optional[int] = Field(
        None,
        description="sends alerts if requests hang for 5min+",
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -125,6 +125,7 @@ from litellm.proxy.auth.auth_checks import (
 )
 from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
 from litellm.exceptions import RejectedRequestError
 from litellm.integrations.slack_alerting import SlackAlertingArgs, SlackAlerting
 try:
    from litellm._version import version
@ -3049,6 +3050,13 @@ class ProxyConfig:
                "global_max_parallel_requests"
            ]
        ## ALERTING ARGS ##
        if "alerting_args" in _general_settings:
            general_settings["alerting_args"] = _general_settings["alerting_args"]
            proxy_logging_obj.slack_alerting_instance.update_values(
                alerting_args=general_settings["alerting_args"],
            )
    async def add_deployment(
        self,
        prisma_client: PrismaClient,
@ -8894,6 +8902,7 @@ async def budget_settings(
                field_description=field_info.description or "",
                field_value=db_budget_row_dict.get(field_name, None),
                stored_in_db=_stored_in_db,
                field_default_value=field_info.default,
            )
            return_val.append(_response_obj)
@ -9791,6 +9800,149 @@ async def model_settings():
    return returned_list
 #### ALERTING MANAGEMENT ENDPOINTS ####
@router.get(
    "/alerting/settings",
    description="Return the configurable alerting param, description, and current value",
    tags=["alerting"],
    dependencies=[Depends(user_api_key_auth)],
    include_in_schema=False,
)
async def alerting_settings(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Used by UI to generate 'alerting settings' page.

    Returns a list with one `ConfigList` entry per configurable slack alerting arg:
    {
        field_name=field_name,
        field_type=allowed_args[field_name]["type"], # string/int
        field_description=field_info.description or "", # human-friendly description
        field_value=<value currently set on the running slack alerting instance>,
        stored_in_db=<True if the arg is present in the db's `alerting_args`, else False>,
        field_default_value=field_info.default,
    }

    Raises:
        HTTPException (400): if no db is connected, or the caller is not a proxy admin.
    """
    global proxy_logging_obj, prisma_client

    if prisma_client is None:
        raise HTTPException(
            status_code=400,
            detail={"error": CommonProxyErrors.db_not_connected_error.value},
        )

    if user_api_key_dict.user_role != "proxy_admin":
        raise HTTPException(
            status_code=400,
            detail={
                "error": "{}, your role={}".format(
                    CommonProxyErrors.not_allowed_access.value,
                    user_api_key_dict.user_role,
                )
            },
        )

    ## get general settings from db
    db_general_settings = await prisma_client.db.litellm_config.find_first(
        where={"param_name": "general_settings"}
    )

    alerting_args_dict: dict = {}
    if db_general_settings is not None and db_general_settings.param_value is not None:
        db_general_settings_dict = dict(db_general_settings.param_value)
        # `alerting_args` can be present but null - treat that the same as "not set",
        # otherwise the membership check below raises a TypeError
        alerting_args_dict = db_general_settings_dict.get("alerting_args") or {}  # type: ignore

    # args the UI is allowed to show/edit, with the input type to render
    allowed_args = {
        "daily_report_frequency": {"type": "Integer"},
        "report_check_interval": {"type": "Integer"},
        "budget_alert_ttl": {"type": "Integer"},
        "outage_alert_ttl": {"type": "Integer"},
        "region_outage_alert_ttl": {"type": "Integer"},
        "minor_outage_alert_threshold": {"type": "Integer"},
        "major_outage_alert_threshold": {"type": "Integer"},
        "max_outage_alert_list_size": {"type": "Integer"},
    }

    # current values come from the live instance, not the db row
    _slack_alerting: SlackAlerting = proxy_logging_obj.slack_alerting_instance
    _slack_alerting_args_dict = _slack_alerting.alerting_args.model_dump()

    return_val = []
    for field_name, field_info in SlackAlertingArgs.model_fields.items():
        if field_name not in allowed_args:
            continue

        _stored_in_db: Optional[bool] = field_name in alerting_args_dict
        _response_obj = ConfigList(
            field_name=field_name,
            field_type=allowed_args[field_name]["type"],
            field_description=field_info.description or "",
            field_value=_slack_alerting_args_dict.get(field_name, None),
            stored_in_db=_stored_in_db,
            field_default_value=field_info.default,
        )
        return_val.append(_response_obj)

    return return_val
 # @router.post(
 #     "/alerting/update",
 #     description="Update the slack alerting settings. Persist value in db.",
 #     tags=["alerting"],
 #     dependencies=[Depends(user_api_key_auth)],
 #     include_in_schema=False,
 # )
 # async def alerting_update(
 #     data: SlackAlertingArgs,
 #     user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 # ):
 #     """Allows updating slack alerting values. Used by UI."""
 #     global prisma_client
 #     if prisma_client is None:
 #         raise HTTPException(
 #             status_code=400,
 #             detail={"error": CommonProxyErrors.db_not_connected_error.value},
 #         )
 #     if user_api_key_dict.user_role != "proxy_admin":
 #         raise HTTPException(
 #             status_code=400,
 #             detail={"error": CommonProxyErrors.not_allowed_access.value},
 #         )
 #     ## get general settings from db
 #     db_general_settings = await prisma_client.db.litellm_config.find_first(
 #         where={"param_name": "general_settings"}
 #     )
 #     ### update value
 #     alerting_args_dict = {}
 #     if db_general_settings is None or db_general_settings.param_value is None:
 #         general_settings = {}
 #         alerting_args_dict = {}
 #     else:
 #         general_settings = dict(db_general_settings.param_value)
 #         _alerting_args_dict = general_settings.get("alerting_args", None)
 #         if _alerting_args_dict is not None and isinstance(_alerting_args_dict, dict):
 #             alerting_args_dict = _alerting_args_dict
 #     alerting_args_dict = data.model
 #     response = await prisma_client.db.litellm_config.upsert(
 #         where={"param_name": "general_settings"},
 #         data={
 #             "create": {"param_name": "general_settings", "param_value": json.dumps(general_settings)},  # type: ignore
 #             "update": {"param_value": json.dumps(general_settings)},  # type: ignore
 #         },
 #     )
 #     return response
 #### EXPERIMENTAL QUEUING ####
 async def _litellm_chat_completions_worker(data, user_api_key_dict):
    """
@ -10934,6 +11086,7 @@ async def get_config_list(
                field_description=field_info.description or "",
                field_value=general_settings.get(field_name, None),
                stored_in_db=_stored_in_db,
                field_default_value=field_info.default,
            )
            return_val.append(_response_obj)
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -92,7 +92,7 @@ class ProxyLogging:
            "new_model_added",
            "outage_alerts",
        ]
-        self.slack_alerting_instance = SlackAlerting(
+        self.slack_alerting_instance: SlackAlerting = SlackAlerting(
            alerting_threshold=self.alerting_threshold,
            alerting=self.alerting,
            alert_types=self.alert_types,
--- a/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx
+++ b/ui/litellm-dashboard/src/components/alerting/alerting_settings.tsx
@ -0,0 +1,123 @@
 /**
 * UI for controlling slack alerting settings
 */
 import React, { useState, useEffect } from "react";
 import {
  Table,
  TableHead,
  TableRow,
  TableHeaderCell,
  TableCell,
  Button,
  Icon,
  Badge,
  TableBody,
  Text,
 } from "@tremor/react";
 import { InputNumber, message } from "antd";
 import { alertingSettingsCall, updateConfigFieldSetting } from "../networking";
 import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
 import DynamicForm from "./dynamic_form";
 /**
  * One configurable alerting arg, as held in this component's state
  * (populated from the `alertingSettingsCall` response).
  */
 interface alertingSettingsItem {
  field_name: string;
  field_type: string;
  // current value; edited locally via handleInputChange
  field_value: any;
  // value restored into field_value when the field is reset
  field_default_value: any;
  field_description: string;
  // set to null locally when the field is reset
  stored_in_db: boolean | null;
 }
 /** Props for the AlertingSettings component. */
 interface AlertingSettingsProps {
  // bearer token for proxy calls; while null/empty nothing is fetched or submitted
  accessToken: string | null;
 }
 /**
  * Loads the configurable slack alerting args for the given access token,
  * keeps the edited values in local state, and submits them as the
  * `alerting_args` config field.
  */
 const AlertingSettings: React.FC<AlertingSettingsProps> = ({ accessToken }) => {
  const [alertingSettings, setAlertingSettings] = useState<
    alertingSettingsItem[]
  >([]);

  useEffect(() => {
    // nothing to fetch without a token
    if (!accessToken) {
      return;
    }
    alertingSettingsCall(accessToken)
      .then((data) => {
        setAlertingSettings(data);
      })
      .catch((error) => {
        // alertingSettingsCall re-throws after reporting the failure;
        // catch here so the rejection is not left unhandled
        console.error("Failed to load alerting settings:", error);
      });
  }, [accessToken]);

  /** Store an edited value for `fieldName` in local state. */
  const handleInputChange = (fieldName: string, newValue: any) => {
    const updatedSettings = alertingSettings.map((setting) =>
      setting.field_name === fieldName
        ? { ...setting, field_value: newValue }
        : setting
    );
    setAlertingSettings(updatedSettings);
  };

  /** Persist the current values as the `alerting_args` config field. */
  const handleSubmit = async (formValues: Record<string, any>) => {
    // `== null` covers both null and undefined
    if (!accessToken || formValues == null) {
      return;
    }

    // values tracked in state (kept up to date by handleInputChange)
    const stateValues: Record<string, any> = {};
    alertingSettings.forEach((setting) => {
      stateValues[setting.field_name] = setting.field_value;
    });

    // state values are spread last, so they take precedence over the raw form values
    const mergedFormValues = { ...formValues, ...stateValues };

    try {
      // wait for the request, so success is only reported if it did not throw
      await updateConfigFieldSetting(
        accessToken,
        "alerting_args",
        mergedFormValues
      );
      message.success("Wait 10s for proxy to update.");
    } catch (error) {
      console.error("Failed to update alerting settings:", error);
    }
  };

  /**
   * Reset `fieldName` to its default value in local state only.
   * `idx` is unused; it is kept to match the DynamicForm callback signature.
   */
  const handleResetField = (fieldName: string, idx: number) => {
    if (!accessToken) {
      return;
    }
    // server-side delete is not wired up yet:
    //   deleteConfigFieldSetting(accessToken, fieldName);
    const updatedSettings = alertingSettings.map((setting) =>
      setting.field_name === fieldName
        ? {
            ...setting,
            stored_in_db: null,
            field_value: setting.field_default_value,
          }
        : setting
    );
    setAlertingSettings(updatedSettings);
  };

  return (
    <DynamicForm
      alertingSettings={alertingSettings}
      handleInputChange={handleInputChange}
      handleResetField={handleResetField}
      handleSubmit={handleSubmit}
    />
  );
 };
 export default AlertingSettings;
--- a/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx
+++ b/ui/litellm-dashboard/src/components/alerting/dynamic_form.tsx
@ -0,0 +1,96 @@
 import React from "react";
 import { Form, Input, InputNumber, Row, Col, Button as Button2 } from "antd";
 import { TrashIcon, CheckCircleIcon } from "@heroicons/react/outline";
 import { Button, Badge, Icon, Text, TableRow, TableCell } from "@tremor/react";
 import Paragraph from "antd/es/typography/Paragraph";
 /** One row rendered by DynamicForm. */
 interface AlertingSetting {
  field_name: string;
  field_description: string;
  // "Integer" renders a numeric input; any other value renders a text input
  field_type: string;
  field_value: any;
  // true -> "In DB" badge, false -> "In Config" badge, null -> "Not Set" badge
  stored_in_db: boolean | null;
 }
 /** Props for DynamicForm; all state lives in the parent component. */
 interface DynamicFormProps {
  alertingSettings: AlertingSetting[];
  // called on every input change with the field name and the new value
  handleInputChange: (fieldName: string, newValue: any) => void;
  // called when the reset icon of a row is clicked
  handleResetField: (fieldName: string, index: number) => void;
  // called on form submit with the values read from the antd form
  handleSubmit: (formValues: Record<string, any>) => void;
 }
 /**
  * Renders one editable row per alerting setting (name + description, input,
  * storage badge, reset icon) plus a submit button. Values are controlled by
  * the parent through `alertingSettings` / `handleInputChange`.
  */
 const DynamicForm: React.FC<DynamicFormProps> = ({
  alertingSettings,
  handleInputChange,
  handleResetField,
  handleSubmit,
 }) => {
  const [form] = Form.useForm();

  const onFinish = () => {
    const formData = form.getFieldsValue();
    handleSubmit(formData);
  };

  return (
    <Form form={form} onFinish={onFinish} labelAlign="left">
      {alertingSettings.map((value, index) => (
        <TableRow key={index}>
          <TableCell>
            <Text>{value.field_name}</Text>
            <p
              style={{
                fontSize: "0.65rem",
                color: "#808080",
                fontStyle: "italic",
              }}
              className="mt-1"
            >
              {value.field_description}
            </p>
          </TableCell>
          <Form.Item name={value.field_name}>
            <TableCell>
              {value.field_type === "Integer" ? (
                <InputNumber
                  step={1}
                  value={value.field_value}
                  // InputNumber hands over the new value directly
                  onChange={(newValue) =>
                    handleInputChange(value.field_name, newValue)
                  }
                />
              ) : (
                <Input
                  value={value.field_value}
                  // Input hands over a change event - pass the text, not the event
                  onChange={(e) =>
                    handleInputChange(value.field_name, e.target.value)
                  }
                />
              )}
            </TableCell>
          </Form.Item>
          <TableCell>
            {value.stored_in_db === true ? (
              <Badge icon={CheckCircleIcon} className="text-white">
                In DB
              </Badge>
            ) : value.stored_in_db === false ? (
              <Badge className="text-gray bg-white outline">In Config</Badge>
            ) : (
              <Badge className="text-gray bg-white outline">Not Set</Badge>
            )}
          </TableCell>
          <TableCell>
            <Icon
              icon={TrashIcon}
              color="red"
              onClick={() => handleResetField(value.field_name, index)}
            >
              Reset
            </Icon>
          </TableCell>
        </TableRow>
      ))}
      <div>
        <Button2 htmlType="submit">Update Settings</Button2>
      </div>
    </Form>
  );
 };
 export default DynamicForm;
--- a/ui/litellm-dashboard/src/components/general_settings.tsx
+++ b/ui/litellm-dashboard/src/components/general_settings.tsx
@ -63,7 +63,6 @@ import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/compon
 import AddFallbacks from "./add_fallbacks";
 import openai from "openai";
 import Paragraph from "antd/es/skeleton/Paragraph";
 interface GeneralSettingsPageProps {
  accessToken: string | null;
  userRole: string | null;
--- a/ui/litellm-dashboard/src/components/networking.tsx
+++ b/ui/litellm-dashboard/src/components/networking.tsx
@ -207,6 +207,41 @@ export const budgetCreateCall = async (
    throw error;
  }
 };
 export const alertingSettingsCall = async (accessToken: String) => {
  /**
   * Get all configurable slack alerting params (GET /alerting/settings).
   *
   * Returns the parsed JSON response. On a non-ok response the response
   * text is shown via `message.error` and an Error is thrown; any error is
   * logged and re-thrown to the caller.
   */
  try {
    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/alerting/settings`
      : `/alerting/settings`;

    const response = await fetch(url, {
      method: "GET",
      headers: {
        Authorization: `Bearer ${accessToken}`,
        "Content-Type": "application/json",
      },
    });

    if (!response.ok) {
      const errorData = await response.text();
      message.error(errorData, 10);
      throw new Error("Network response was not ok");
    }

    const data = await response.json();
    return data;
  } catch (error) {
    console.error("Failed to get alerting settings:", error);
    throw error;
  }
 };
 export const keyCreateCall = async (
  accessToken: string,
  userID: string,
@ -995,9 +1030,16 @@ export const adminTopEndUsersCall = async (
  }
 };
-export const adminspendByProvider = async (accessToken: String, keyToken: String | null, startTime: String | undefined, endTime: String | undefined) => {
+export const adminspendByProvider = async (
  accessToken: String,
  keyToken: String | null,
  startTime: String | undefined,
  endTime: String | undefined
 ) => {
  try {
-    let url = proxyBaseUrl ? `${proxyBaseUrl}/global/spend/provider` : `/global/spend/provider`;
+    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/global/spend/provider`
      : `/global/spend/provider`;
    if (startTime && endTime) {
      url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1036,9 +1078,15 @@ export const adminspendByProvider = async (accessToken: String, keyToken: String
  }
 };
-export const adminGlobalActivity = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
+export const adminGlobalActivity = async (
  accessToken: String,
  startTime: String | undefined,
  endTime: String | undefined
 ) => {
  try {
-    let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity` : `/global/activity`;
+    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/global/activity`
      : `/global/activity`;
    if (startTime && endTime) {
      url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1071,10 +1119,15 @@ export const adminGlobalActivity = async (accessToken: String, startTime: String
  }
 };
-
+export const adminGlobalActivityPerModel = async (
-export const adminGlobalActivityPerModel = async (accessToken: String, startTime: String | undefined, endTime: String | undefined) => {
+  accessToken: String,
  startTime: String | undefined,
  endTime: String | undefined
 ) => {
  try {
-    let url = proxyBaseUrl ? `${proxyBaseUrl}/global/activity/model` : `/global/activity/model`;
+    let url = proxyBaseUrl
      ? `${proxyBaseUrl}/global/activity/model`
      : `/global/activity/model`;
    if (startTime && endTime) {
      url += `?start_date=${startTime}&end_date=${endTime}`;
@ -1107,7 +1160,6 @@ export const adminGlobalActivityPerModel = async (accessToken: String, startTime
  }
 };
 export const adminTopModelsCall = async (accessToken: String) => {
  try {
    let url = proxyBaseUrl
--- a/ui/litellm-dashboard/src/components/settings.tsx
+++ b/ui/litellm-dashboard/src/components/settings.tsx
@ -31,7 +31,7 @@ import {
 } from "./networking";
 import { Modal, Form, Input, Select, Button as Button2, message } from "antd";
 import StaticGenerationSearchParamsBailoutProvider from "next/dist/client/components/static-generation-searchparams-bailout-provider";
-
+import AlertingSettings from "./alerting/alerting_settings";
 interface SettingsPageProps {
  accessToken: string | null;
  userRole: string | null;
@ -117,6 +117,7 @@ const Settings: React.FC<SettingsPageProps> = ({
    db_exceptions: "Database Exceptions (Read/Write)",
    daily_reports: "Weekly/Monthly Spend Reports",
    outage_alerts: "Outage Alerts",
    region_outage_alerts: "Region Outage Alerts",
  };
  useEffect(() => {
@ -365,7 +366,8 @@ const Settings: React.FC<SettingsPageProps> = ({
        <TabGroup>
          <TabList variant="line" defaultValue="1">
            <Tab value="1">Logging Callbacks</Tab>
-            <Tab value="2">Alerting</Tab>
+            <Tab value="2">Alerting Types</Tab>
            <Tab value="2">Alerting Settings</Tab>
          </TabList>
          <TabPanels>
            <TabPanel>
@ -496,6 +498,9 @@ const Settings: React.FC<SettingsPageProps> = ({
                </Button>
              </Card>
            </TabPanel>
            <TabPanel>
              <AlertingSettings accessToken={accessToken} />
            </TabPanel>
          </TabPanels>
        </TabGroup>
      </Grid>